In [10]:
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd

from src.utilities import run_cv, generate_backbone, check_folder
from src.ToyModel import ToyModel
from src.FeatureGenerator import FeatureGenerator
from src.settings import RAW_PATH, PROCESSED_PATH, SHIFTS, WINS, COLS_MIN_MAX, GROUP_COLS, ROLL_FUNCS

Before running this notebook, please put raw files in `data/raw/` and check respective path in src.settings

# Initial merge of `sales_train` and `items`

In [4]:
sales_train = pd.read_csv(RAW_PATH + 'sales_train.csv')
items = pd.read_csv(RAW_PATH + 'items.csv')[['item_id', 'item_category_id']]

In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [7]:
merged_df = sales_train.merge(items, how='left', on='item_id')

In [8]:
merged_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [11]:
# creating folder for processed data
check_folder(PROCESSED_PATH, flash_folder=False)

In [12]:
merged_df.to_parquet(PROCESSED_PATH + 'merged_train_df.parquet', index=False)

# Constructing target

This will be a simpler version of target where instead of rolling window of 30 days we will simply use the month id to sum aggregate number of items sold. We will calculate this for each shop and item, add missing months with 0 sales and shift the aggregate by 1 month.

This functionality has been moved to src.TestGenerator

## Building features

## Features describing sales for `shop_id` and `date_block_num`

For all feature construction we have to use expanded dataset to preserve the sequence of months. If some months are missing, than shifts and window aggregates would not be correct since we are not working with the datetime column here.

This functionality has been implemented and moved to src.FeatureGenerator

### Counts of deals per month and per shop, lags, rolling aggregates

This functionality has been implemented and moved to src.FeatureGenerator

### Aggregates over prices per month and per shop, lags, window aggregates

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `category_id` and `date_block_num`

In [None]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
local_df = merged_df[['date_block_num', 'item_id', 'shop_id', 'item_cnt_day', 'item_price']].reset_index(drop=True).copy()

index_cols = ['shop_id', 'item_id', 'date_block_num']
base_cols = ['item_price', 'item_cnt_day']
target_col = ['target']

res_df = generate_backbone()

In [None]:
# base aggregates
agg_di = {col: ROLL_FUNCS for col in base_cols}

for k, group in GROUP_COLS.items():
    agg_df = local_df.groupby(group, as_index = False).agg(agg_di)
    agg_df.columns = ['_'.join(col) + f'_per_{k}' if col[1] else col[0] for col in agg_df.columns ]
    res_df = res_df.merge(agg_df, how='left', on=group).fillna(0)

res_df = res_df.rename(columns={'item_cnt_day_sum_per_shop_item': 'target'})
base_feat_cols = [col for col in res_df if col not in index_cols + base_cols + target_col]

In [None]:
# lag aggregates
cols_to_shift = base_feat_cols + target_col
shifted_cols = []
for shift in SHIFTS:
    preshift_df = res_df[index_cols + cols_to_shift].copy()
    preshift_df['date_block_num'] = preshift_df['date_block_num'] + shift
    rename_dict = {col: f'{col}_lag_{shift}' for col in cols_to_shift}
    shifted_cols += [f'{col}_lag_{shift}' for col in cols_to_shift]
    preshift_df = preshift_df.rename(columns = rename_dict)
    
    res_df = res_df.merge(preshift_df, how='left', on=index_cols).fillna(0)

In [None]:
# rolling window aggregates
roll_cols = []
col = target_col[0]
for win_len in WINS:
    # groupping_k = col.split('_per_')[1] if not col == target_col[0] else 'shop_item'
    group = GROUP_COLS['shop_item']
    roll_df = res_df[group + [col]].drop_duplicates().sort_values(group)
    new_name = f'{col}_roll_mean_{win_len}'
    roll_df = roll_df.groupby(group[:-1], as_index=False)\
                        .rolling(win_len, on='date_block_num', closed='right')[col].mean().fillna(0).reset_index()\
                        .rename(columns={col: new_name})
    res_df = res_df.merge(roll_df, how='left', on=group)
    roll_cols.append(new_name)
    # print(roll_df.head())
    # print('-'*30)
    # tmp.groupby('shop_id', as_index=False).rolling(2, on='date_block_num')[base_feat_cols[0]].sum()

In [None]:
res_df.head()

This functionality has been moved to src.FeatureGenerator

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `item_id` and `date_block_num`

## Autoregression features

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales of a given category in general and for a given shop

In [3]:
GROUP_COLS = {
    'category': ['item_category_id', 'date_block_num'],
    'shop_category': ['shop_id', 'item_category_id', 'date_block_num']
}

index_cols = ['shop_id', 'item_id', 'date_block_num']
base_cols = ['item_price', 'item_cnt_day']
target_col = ['target']
cat_col = ['item_category_id']

In [14]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
local_df = merged_df[['date_block_num', 'item_id', 'shop_id', 
                      'item_cnt_day', 'item_price', 'item_category_id']].reset_index(drop=True).copy()
item_cat_map = local_df[['item_id', 'item_category_id']].drop_duplicates()
res_df = generate_backbone().merge(item_cat_map, how='left')
agg_di = {col: ROLL_FUNCS for col in base_cols}

for k, group in GROUP_COLS.items():
    agg_df = local_df.groupby(group, as_index = False).agg(agg_di)
    agg_df.columns = ['_'.join(col) + f'_per_{k}' if col[1] else col[0] for col in agg_df.columns ]
    print(k)
    res_df = res_df.merge(agg_df, how='left', on=group).fillna(0)

# res_df = res_df.rename(columns={'item_cnt_day_sum_per_shop_item': 'target'})
base_feat_cols = [str(col) for col in res_df if col not in 
                  index_cols + base_cols + target_col + cat_col]

category
shop_category


In [15]:
base_feat_cols

['item_price_sum_per_category',
 'item_price_mean_per_category',
 'item_cnt_day_sum_per_category',
 'item_cnt_day_mean_per_category',
 'item_price_sum_per_shop_category',
 'item_price_mean_per_shop_category',
 'item_cnt_day_sum_per_shop_category',
 'item_cnt_day_mean_per_shop_category']

In [11]:
res_df

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_category,item_price_mean_per_category,item_cnt_day_sum_per_category,item_cnt_day_mean_per_category,item_price_sum_per_shop_category,item_price_mean_per_shop_category,item_cnt_day_sum_per_shop_category,item_cnt_day_mean_per_shop_category
0,26,0,0,40.0,7.145102e+06,244.485942,33489.0,1.145902,152869.000000,261.314530,607.0,1.037607
1,26,0,1,40.0,6.771906e+06,252.174936,31649.0,1.178558,151206.566667,258.915354,631.0,1.080479
2,26,0,2,40.0,8.023436e+06,260.645016,36277.0,1.178475,174743.266667,264.361977,725.0,1.096823
3,26,0,3,40.0,6.045864e+06,254.852406,27525.0,1.160266,156502.833333,266.161281,647.0,1.100340
4,26,0,4,40.0,5.554534e+06,247.627583,25470.0,1.135482,115066.833333,252.339547,488.0,1.070175
...,...,...,...,...,...,...,...,...,...,...,...,...
2261335,28,22169,29,69.0,5.913720e+05,938.685683,660.0,1.047619,28744.000000,1105.538462,27.0,1.038462
2261336,28,22169,30,69.0,7.748948e+05,738.698594,1124.0,1.071497,34694.150000,963.726389,36.0,1.000000
2261337,28,22169,31,69.0,7.737686e+05,888.368037,901.0,1.034443,27395.000000,1095.800000,25.0,1.000000
2261338,28,22169,32,69.0,6.168589e+05,947.555853,665.0,1.021505,37622.000000,1140.060606,33.0,1.000000


This functionality has been added to src.FeatureGenerator

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predic number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create feature dataset to predict target for a given test.csv.

FeatureGenerator class have been created and moved to src. We've imported it above and will demonstrate usage for the cases of cross validation and inference for a provided test index backbone here.

In [2]:
# Here we are generating features dataset for all shops, itmes and months specified in COLS_MIN_MAX in settings.
feat_generator = FeatureGenerator(verbose=True)

# feature generation has been tested both for small number of unique shop_id and all values.
# To optimise the latter case, batch processing and offloading have been implemented.
test_features_df = feat_generator.generate_features()
test_features_df.head()

base feats done
6 batches
shifts done
rolls done
batch 1/6 done
------------------------------
shifts done
rolls done
batch 2/6 done
------------------------------
shifts done
rolls done
batch 3/6 done
------------------------------
shifts done
rolls done
batch 4/6 done
------------------------------
shifts done
rolls done
batch 5/6 done
------------------------------
shifts done
rolls done
batch 6/6 done
------------------------------
concatenating


Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,item_price_mean_per_item_lag_1,...,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12,target
0,0,30,1,40,2546339.0,531.262024,5578.0,1.163781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0
1,0,31,1,37,2546339.0,531.262024,5578.0,1.163781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0
2,0,32,0,40,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,0,32,1,40,2546339.0,531.262024,5578.0,1.163781,76074.828125,338.110352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
4,0,33,0,37,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


So, features for all specified shops, items and months are generated correctly. In this case the backbone for all combinations of those index columns is generated as part of the generator logic. Let's now use this functionality
to generate features for the provided test index backbone.

In [4]:
# we are told that the test month is November, so following the last one in the train set, so 34-th 
test_month_num = 34 

# test_backbone in this case is the provided test dataset itself
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')
test_backbone['date_block_num'] = test_month_num
feat_generator = FeatureGenerator()
test_features_df = feat_generator.add_features_to_backbone(test_backbone=test_backbone, target_month=test_month_num)
test_features_df.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,...,item_cnt_day_sum_per_category_lag_12,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12
0,0,5,5037,34,19.0,994646.666667,1030.721934,1052.0,1.090155,37475.0,...,6134.0,1.144403,120378.0,1695.464789,77.0,1.084507,1.0,0.5,1.2,1.0
1,1,5,5320,34,0.0,994646.666667,1030.721934,1052.0,1.090155,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,34,19.0,994646.666667,1030.721934,1052.0,1.090155,49159.0,...,6134.0,1.144403,120378.0,1695.464789,77.0,1.084507,0.0,2.0,1.4,0.833333
3,3,5,5232,34,23.0,994646.666667,1030.721934,1052.0,1.090155,35713.0,...,5275.0,1.099875,120435.5,1605.806667,79.0,1.053333,0.0,0.0,0.2,0.083333
4,4,5,5268,34,0.0,994646.666667,1030.721934,1052.0,1.090155,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# some lag feature cells are empty, this is because the original file does not contain any sales for 
# these items/shops combinations
merged_df[(merged_df['item_id']  == 5268) & (merged_df['shop_id']  == 5)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id


# Cross validation

In here we will check if existing functionality is enough to do a round of crossvalidation.
Overall idea:
- data for all folds is generated using TestGenerator and FeatureGenerator classes
- data is split to train/test by order number of month using sklearn.TimeSeriesSplit

In [None]:
# TODO: consider introducing 1-2 months gap between train and test
# TODO: it should be possible to use sklearn cross validation functionality: 
# 1) build df with all features and offload it to data/processed 
# 2) build custom layer to sample from the dataset using sklearn.TimeSeriesSplit based on months

In [16]:
# using time series split for the months column
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
all_months = merged_df['date_block_num'].unique()
all_months = all_months[all_months > max(WINS+SHIFTS)] # leaving enough months for longest shift/window calculation
print(f'all months: {all_months}\n len: {len(all_months)}')

tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

all months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33]
 len: 21
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10]
  Test:  index=[11 12]
Fold 1:
  Train: index=[ 2  3  4  5  6  7  8  9 10 11 12]
  Test:  index=[13 14]
Fold 2:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16]
Fold 3:
  Train: index=[ 6  7  8  9 10 11 12 13 14 15 16]
  Test:  index=[17 18]
Fold 4:
  Train: index=[ 8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[19 20]


Let's try the implemented functionality to run a simple time series cross validation. We will use a simple mean predictor as a model for now.

The run_cv function have been implemented and moved to src.utilities. </br>
The simple mean predictor model have been implemented and moved to src.ToyModel.

In [2]:
# initiating data generators
feat_generator = FeatureGenerator()

# initiating toy model
model = ToyModel()

# initiating cv splitter
tscv = TimeSeriesSplit(test_size = 1, max_train_size=16)

# generating all of the data which we will iterate over during CV
features_df = feat_generator.generate_features()

# creating col lists for training
cols_di={
    'index': feat_generator.index_cols,
    'target': feat_generator.target_col,
    'feats': feat_generator.shifted_cols + feat_generator.roll_cols
}

# iterating over CV folds
cv_res = run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di, verbose=2)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28], size: 120,664
  Test months: [29],   size: 7,019
  NRMSE:  1.0
  RMSE :  5.2

Fold 1:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29], size: 119,754
  Test months: [30],   size: 6,708
  NRMSE:  1.0
  RMSE :  4.5

Fold 2:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30], size: 118,591
  Test months: [31],   size: 5,715
  NRMSE:  1.0
  RMSE :  4.4

Fold 3:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31], size: 116,872
  Test months: [32],   size: 5,341
  NRMSE:  1.0
  RMSE :  5.0

Fold 4:
  Train months: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32], size: 114,549
  Test months: [33],   size: 5,677
  NRMSE:  1.0
  RMSE :  3.9


------------------------------
RMSE mean: 4.6
NRMSE mean: 1.0


We see that functionality built is sufficient and we can now go on to try more complex models.

In [3]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

model = LinearRegression()
# model = XGBRegressor()

# creating col lists for training
cols_di={
    'index': feat_generator.index_cols,
    'target': feat_generator.target_col,
    'feats': feat_generator.roll_cols + feat_generator.shifted_cols 
}

# iterating over CV folds
cv_res = run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di, verbose=2)

Fold 0:
  Train months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28], size: 120,664
  Test months: [29],   size: 7,019
  NRMSE:  0.51
  RMSE :  2.6

Fold 1:
  Train months: [14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29], size: 119,754
  Test months: [30],   size: 6,708
  NRMSE:  0.49
  RMSE :  2.2

Fold 2:
  Train months: [15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30], size: 118,591
  Test months: [31],   size: 5,715
  NRMSE:  0.4
  RMSE :  1.7

Fold 3:
  Train months: [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31], size: 116,872
  Test months: [32],   size: 5,341
  NRMSE:  0.69
  RMSE :  3.4

Fold 4:
  Train months: [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32], size: 114,549
  Test months: [33],   size: 5,677
  NRMSE:  0.73
  RMSE :  2.8


------------------------------
RMSE mean: 2.6
NRMSE mean: 0.57
