In [1]:
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import numpy as np

from src.utilities import run_cv, generate_backbone
from src.ToyModel import ToyModel
from src.FeatureGenerator import FeatureGenerator
from src.settings import RAW_PATH, PROCESSED_PATH, SHIFTS, WINS, COLS_MIN_MAX, GROUP_COLS, ROLL_FUNCS

# Initial merge of `sales_train` and `items`

In [2]:
# Raw sales records, read unchanged from the raw data directory.
sales_train = pd.read_csv(RAW_PATH + 'sales_train.csv')

# Item table: read the whole file, then keep only the item -> category mapping columns.
items = pd.read_csv(RAW_PATH + 'items.csv').loc[:, ['item_id', 'item_category_id']]

In [3]:
# Preview the first five rows of the raw sales table.
sales_train.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# Preview the first five rows of the item -> category mapping.
items.head(n=5)

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [5]:
# Attach `item_category_id` to every sales row via a left join on `item_id`.
# `validate='many_to_one'` makes pandas raise a MergeError if `item_id` is not
# unique in `items`; without the check a repeated `item_id` would silently
# duplicate sales rows and inflate every sum/count computed from `merged_df`.
merged_df = sales_train.merge(items, how='left', on='item_id', validate='many_to_one')

In [27]:
# Per-column count of missing values in the merged frame
# (a left join leaves NaN in `item_category_id` for any unmatched `item_id`).
merged_df.isnull().sum(axis=0)

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [29]:
# Persist the merged frame (without the index) so later cells can reload it.
merged_df.to_parquet(
    PROCESSED_PATH + 'merged_train_df.parquet',
    index=False,
)

# Constructing target

This will be a simpler version of the target: instead of a rolling window of 30 days, we simply use the month id to sum the number of items sold. We calculate this for each shop and item, add missing months with 0 sales, and shift the aggregate by 1 month.

This functionality has been moved to src.TestGenerator

## Building features

## Features describing sales for `shop_id` and `date_block_num`

For all feature construction we have to use the expanded dataset to preserve the sequence of months. If some months are missing, then shifts and window aggregates would not be correct, since we are not working with the datetime column here.

This functionality has been implemented and moved to src.FeatureGenerator

### Counts of deals per month and per shop, lags, rolling aggregates

This functionality has been implemented and moved to src.FeatureGenerator

### Aggregates over prices per month and per shop, lags, window aggregates

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `category_id` and `date_block_num`

In [2]:
# Reload the merged sales frame from the processed data directory.
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')

# Column roles used by the feature-building cells of this section.
index_cols = ['shop_id', 'item_id', 'date_block_num']
base_cols = ['item_price', 'item_cnt_day']
target_col = ['target']

# Working copy restricted to the key columns and the two raw measures,
# re-indexed from 0 so it does not carry over the original row labels.
local_df = (
    merged_df
    .loc[:, ['date_block_num', 'item_id', 'shop_id', 'item_cnt_day', 'item_price']]
    .reset_index(drop=True)
    .copy()
)

# Frame that all aggregates are merged onto.
# NOTE(review): presumably the full shop/item/month grid - confirm in src.utilities.
res_df = generate_backbone()

In [3]:
# base aggregates
# Every raw measure is aggregated with the same set of functions (ROLL_FUNCS).
agg_di = dict.fromkeys(base_cols, ROLL_FUNCS)

for k, group in GROUP_COLS.items():
    agg_df = local_df.groupby(group, as_index=False).agg(agg_di)

    # Flatten the two-level column labels: aggregated columns become
    # '<column>_<func>_per_<k>', while grouping keys (empty second level)
    # keep their plain name so they can be used as merge keys.
    flat_names = []
    for label in agg_df.columns:
        if label[1]:
            flat_names.append('_'.join(label) + f'_per_{k}')
        else:
            flat_names.append(label[0])
    agg_df.columns = flat_names

    # Left-join onto the backbone; every NaN left in the frame is replaced by 0.
    res_df = res_df.merge(agg_df, how='left', on=group).fillna(0)

# The summed item count of the 'shop_item' grouping is used as the target.
res_df = res_df.rename(columns={'item_cnt_day_sum_per_shop_item': 'target'})

# Base features: every column that is not a key, a raw measure or the target.
excluded_cols = set(index_cols + base_cols + target_col)
base_feat_cols = [name for name in res_df if name not in excluded_cols]

In [4]:
# lag aggregates
# For each shift, a copy of the features is re-keyed to `date_block_num + shift`
# and joined back on the index columns, so the row for month m receives the
# values that were observed in month m - shift.
cols_to_shift = base_feat_cols + target_col
shifted_cols = []
for shift in SHIFTS:
    rename_dict = {name: f'{name}_lag_{shift}' for name in cols_to_shift}

    preshift_df = (
        res_df[index_cols + cols_to_shift]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + shift)
        .rename(columns=rename_dict)
    )
    shifted_cols.extend(f'{name}_lag_{shift}' for name in cols_to_shift)

    # Rows without a source month `shift` steps back end up with 0 after fillna.
    res_df = res_df.merge(preshift_df, how='left', on=index_cols).fillna(0)

In [5]:
# rolling window aggregates
# Rolling mean of the target per group over the `win_len` rows that PRECEDE
# the current month.
#
# The window is closed on the left (`closed='left'`), i.e. it covers the rows
# t - win_len .. t - 1 and excludes row t itself. With the default right-closed
# window the current month's own `target` was averaged into a feature for that
# same month, which leaks the label into the features and is inconsistent with
# the lag features, which only look backwards.
#
# Windows with fewer than `win_len` preceding rows yield NaN (default
# min_periods for a fixed window) and are filled with 0.
roll_cols = []
col = target_col[0]
# assumes the last entry of GROUP_COLS['shop_item'] is 'date_block_num' - confirm in src.settings
group = GROUP_COLS['shop_item']
for win_len in WINS:
    new_name = f'{col}_roll_mean_{win_len}'

    # One row per group/month, ordered so that rows inside a group are chronological.
    roll_df = res_df[group + [col]].drop_duplicates().sort_values(group)
    roll_df = (
        roll_df.groupby(group[:-1], as_index=False)
        .rolling(win_len, on='date_block_num', closed='left')[col]
        .mean()
        .fillna(0)
        .reset_index()
        .rename(columns={col: new_name})
    )

    # Drop a stale copy of the column first so that re-running this cell does
    # not produce `_x` / `_y` suffixed duplicates.
    res_df = res_df.drop(columns=new_name, errors='ignore').merge(roll_df, how='left', on=group)
    roll_cols.append(new_name)

In [6]:
# Preview the first five rows of the assembled feature frame.
res_df.head(n=5)

Unnamed: 0,shop_id,item_id,date_block_num,item_price_sum_per_shop,item_price_mean_per_shop,item_cnt_day_sum_per_shop,item_cnt_day_mean_per_shop,item_price_sum_per_item,item_price_mean_per_item,item_cnt_day_sum_per_item,...,item_price_sum_per_item_lag_12,item_price_mean_per_item_lag_12,item_cnt_day_sum_per_item_lag_12,item_cnt_day_mean_per_item_lag_12,item_price_sum_per_shop_item_lag_12,item_price_mean_per_shop_item_lag_12,item_cnt_day_mean_per_shop_item_lag_12,target_lag_12,target_roll_mean_3,target_roll_mean_9
0,26,0,0,1457946.0,680.012042,2331.0,1.08722,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26,0,1,1525165.0,681.790499,2597.0,1.16093,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26,0,2,1876700.0,761.33865,3036.0,1.231643,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26,0,3,1328178.0,633.975111,2381.0,1.136516,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26,0,4,1193884.0,669.217327,2148.0,1.204036,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This functionality has been moved to src.FeatureGenerator

## Feature describing sales for `shop_id`, `item_id` and `date_block_num`

This functionality has been implemented and moved to src.FeatureGenerator

## Autoregression features

This functionality has been implemented and moved to src.FeatureGenerator

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predict the number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create feature dataset to predict target for a given test.csv.

The FeatureGenerator and TestGenerator classes have been created and moved to src. FeatureGenerator is imported above, and here we demonstrate its usage for cross-validation and for inference on a provided test index backbone.

In [5]:
# Suppose month 11 is the target month, i.e. the month we want to predict for.
feat_generator = FeatureGenerator(target_months=[11])

# Generate the features for that target month and preview the first five rows.
test_features_df = feat_generator.generate_features()
test_features_df.head(n=5)

Unnamed: 0,shop_id,item_id,date_block_num,item_price_sum_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_price_sum_per_item_lag_1,item_cnt_day_sum_per_item_lag_1,item_price_sum_per_shop_item_lag_1,target_lag_1,item_price_sum_per_shop_lag_2,...,item_price_sum_per_shop_item_lag_6,target_lag_6,item_price_sum_per_shop_lag_12,item_cnt_day_sum_per_shop_lag_12,item_price_sum_per_item_lag_12,item_cnt_day_sum_per_item_lag_12,item_price_sum_per_shop_item_lag_12,target_lag_12,target_roll_mean_3,target_roll_mean_9
11,26,0,11,1433133.0,2409.0,0.0,0.0,0.0,0.0,1421026.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,26,1,11,1433133.0,2409.0,0.0,0.0,0.0,0.0,1421026.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,26,2,11,1433133.0,2409.0,0.0,0.0,0.0,0.0,1421026.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,26,3,11,1433133.0,2409.0,0.0,0.0,0.0,0.0,1421026.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,26,4,11,1433133.0,2409.0,0.0,0.0,0.0,0.0,1421026.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So, features for a generated shop-item backbone and a given month are generated correctly. Let's now use this functionality
to generate features for the provided test index backbone.

In [7]:
# The test month is November, i.e. the month right after the last one in the
# train set, which makes it month number 34.
test_month_num = 34

# Here the provided test dataset itself serves as the backbone.
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')

# Generator targeting only the test month; features are attached to the backbone rows.
feat_generator = FeatureGenerator(target_months=[test_month_num])
test_features_df = feat_generator.add_features_to_backbone(test_backbone=test_backbone)

test_features_df

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price_sum_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_price_sum_per_item_lag_1,item_cnt_day_sum_per_item_lag_1,item_price_sum_per_shop_item_lag_1,target_lag_1,...,item_price_sum_per_shop_item_lag_6,target_lag_6,item_price_sum_per_shop_lag_12,item_cnt_day_sum_per_shop_lag_12,item_price_sum_per_item_lag_12,item_cnt_day_sum_per_item_lag_12,item_price_sum_per_shop_item_lag_12,target_lag_12,target_roll_mean_3,target_roll_mean_9
0,0,5,5037,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,5,5320,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,5,5232,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,5268,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,18454,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214196,214196,45,16188,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214197,214197,45,15757,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214198,214198,45,19648,34,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Inclusive range of shop ids given by the min/max pair stored in COLS_MIN_MAX.
shop_bounds = COLS_MIN_MAX['shop_id']
train_shop_range = range(shop_bounds[0], shop_bounds[1] + 1)

# Show only the test rows whose shop falls inside that range.
in_train_shops = test_features_df['shop_id'].isin(train_shop_range)
test_features_df.loc[in_train_shops]

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price_sum_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_price_sum_per_item_lag_1,item_cnt_day_sum_per_item_lag_1,item_price_sum_per_shop_item_lag_1,target_lag_1,...,item_price_sum_per_shop_item_lag_6,target_lag_6,item_price_sum_per_shop_lag_12,item_cnt_day_sum_per_shop_lag_12,item_price_sum_per_item_lag_12,item_cnt_day_sum_per_item_lag_12,item_price_sum_per_shop_item_lag_12,target_lag_12,target_roll_mean_3,target_roll_mean_9
40800,40800,28,5037,34,2.866024e+06,3018.0,37475.0,25.0,2998.0,2.0,...,7796.0,5.0,4.662206e+06,5492.0,167025.29,65.0,2599.0,1.0,3.333333,3.111111
40801,40801,28,5320,34,2.866024e+06,3018.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.662206e+06,5492.0,0.00,0.0,0.0,0.0,0.000000,0.000000
40802,40802,28,5233,34,2.866024e+06,3018.0,49159.0,42.0,4796.0,4.0,...,2997.0,3.0,4.662206e+06,5492.0,0.00,0.0,0.0,0.0,3.333333,2.666667
40803,40803,28,5232,34,2.866024e+06,3018.0,35713.0,28.0,4796.0,4.0,...,0.0,0.0,4.662206e+06,5492.0,0.00,0.0,0.0,0.0,2.333333,0.777778
40804,40804,28,5268,34,2.866024e+06,3018.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.662206e+06,5492.0,0.00,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56095,56095,26,18454,34,1.413624e+06,1409.0,198.0,2.0,0.0,0.0,...,199.0,1.0,1.854890e+06,2061.0,0.00,0.0,0.0,0.0,0.000000,0.333333
56096,56096,26,16188,34,1.413624e+06,1409.0,1359.0,1.0,0.0,0.0,...,0.0,0.0,1.854890e+06,2061.0,0.00,0.0,0.0,0.0,0.000000,0.000000
56097,56097,26,15757,34,1.413624e+06,1409.0,1145.0,5.0,0.0,0.0,...,229.0,1.0,1.854890e+06,2061.0,1791.00,9.0,0.0,0.0,0.000000,0.111111
56098,56098,26,19648,34,1.413624e+06,1409.0,178.2,2.0,0.0,0.0,...,0.0,0.0,1.854890e+06,2061.0,0.00,0.0,0.0,0.0,0.000000,0.111111


# Cross validation

Here we check whether the existing functionality is enough to run a round of cross-validation.
Overall idea:
- data for all folds is generated using TestGenerator and FeatureGenerator classes
- data is split to train/test by order number of month using sklearn.TimeSeriesSplit

In [53]:
# TODO: consider introducing 1-2 months gap between train and test
# TODO: it should be possible to use sklearn cross validation functionality: 
# 1) build df with all features and offload it to data/processed 
# 2) build custom layer to sample from the dataset using sklearn.TimeSeriesSplit based on months

In [12]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')

# `Series.unique()` returns values in order of first appearance, not sorted.
# TimeSeriesSplit splits purely by position, so the months are sorted
# explicitly instead of relying on the row order of the stored file.
all_months = np.sort(merged_df['date_block_num'].unique())

# leaving enough months for longest shift/window calculation
all_months = all_months[all_months > max(WINS+SHIFTS)]
print(f'all months: {all_months}\n len: {len(all_months)}')

# Each fold tests on 2 consecutive positions and trains on at most the 11
# positions that precede them; the indices printed below are positions in
# `all_months`, not month numbers.
tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

all months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33]
 len: 21
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10]
  Test:  index=[11 12]
Fold 1:
  Train: index=[ 2  3  4  5  6  7  8  9 10 11 12]
  Test:  index=[13 14]
Fold 2:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16]
Fold 3:
  Train: index=[ 6  7  8  9 10 11 12 13 14 15 16]
  Test:  index=[17 18]
Fold 4:
  Train: index=[ 8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[19 20]


Let's try the implemented functionality to run a simple time series cross validation. We will use a simple mean predictor as a model for now.

In [2]:
# Feature generator with its default configuration.
feat_generator = FeatureGenerator()

# Toy model used for this first cross-validation run.
model = ToyModel()

# Splitter over months: 2 test positions per fold, at most 11 train positions.
tscv = TimeSeriesSplit(max_train_size=11, test_size=2)

# Full feature frame that the CV folds are drawn from.
features_df = feat_generator.generate_features()

# Column roles handed to run_cv (read after the features have been generated).
cols_di = dict(
    index=feat_generator.index_cols,
    target=feat_generator.target_col,
    feats=feat_generator.shifted_cols + feat_generator.roll_cols,
)

# Run the cross-validation loop over all folds.
cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=model,
    cols_di=cols_di,
    verbose=2,
)

133020 731610
Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23]
  Test months: [24 25]
  NRMSE:  1.0
  RMSE :  2.7


------------------------------
133020 731610
Fold 1:
  Train months: [15 16 17 18 19 20 21 22 23 24 25]
  Test months: [26 27]
  NRMSE:  1.0
  RMSE :  1.7


------------------------------
133020 731610
Fold 2:
  Train months: [17 18 19 20 21 22 23 24 25 26 27]
  Test months: [28 29]
  NRMSE:  1.0
  RMSE :  1.9


------------------------------
133020 731610
Fold 3:
  Train months: [19 20 21 22 23 24 25 26 27 28 29]
  Test months: [30 31]
  NRMSE:  1.0
  RMSE :  1.4


------------------------------
133020 731610
Fold 4:
  Train months: [21 22 23 24 25 26 27 28 29 30 31]
  Test months: [32 33]
  NRMSE:  1.0
  RMSE :  1.3


------------------------------
RMSE mean: 1.8
NRMSE mean: 1.0


We see that the functionality built so far is sufficient, so we can now move on to more complex models.

In [4]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# Replace the toy model with a linear regression; XGBRegressor is the alternative.
model = LinearRegression()
# model = XGBRegressor()

# Column roles handed to run_cv; the generator, feature frame and splitter
# are reused from the previous cross-validation cell.
cols_di = dict(
    index=feat_generator.index_cols,
    target=feat_generator.target_col,
    feats=feat_generator.roll_cols + feat_generator.shifted_cols,
)

# Run the cross-validation loop over all folds.
cv_res = run_cv(
    df=features_df,
    months_cv_split=tscv,
    model=model,
    cols_di=cols_di,
    verbose=2,
)

133020 731610
Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23]
  Test months: [24 25]
  NRMSE:  0.41
  RMSE :  1.1


------------------------------
133020 731610
Fold 1:
  Train months: [15 16 17 18 19 20 21 22 23 24 25]
  Test months: [26 27]
  NRMSE:  0.49
  RMSE :  0.84


------------------------------
133020 731610
Fold 2:
  Train months: [17 18 19 20 21 22 23 24 25 26 27]
  Test months: [28 29]
  NRMSE:  0.48
  RMSE :  0.88


------------------------------
133020 731610
Fold 3:
  Train months: [19 20 21 22 23 24 25 26 27 28 29]
  Test months: [30 31]
  NRMSE:  0.28
  RMSE :  0.39


------------------------------
133020 731610
Fold 4:
  Train months: [21 22 23 24 25 26 27 28 29 30 31]
  Test months: [32 33]
  NRMSE:  0.39
  RMSE :  0.5


------------------------------
RMSE mean: 0.75
NRMSE mean: 0.41
