In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

from src.utilities import run_cv, generate_backbone
from src.ToyModel import ToyModel
from src.FeatureGenerator import FeatureGenerator
from src.settings import RAW_PATH, PROCESSED_PATH, SHIFTS, WINS, COLS_MIN_MAX, GROUP_COLS, ROLL_FUNCS

# Initial merge of `sales_train` and `items`

In [None]:
# Raw sales records.
sales_train = pd.read_csv(RAW_PATH + 'sales_train.csv')

# Item catalogue, reduced to the item -> category mapping used for the merge.
item_map_cols = ['item_id', 'item_category_id']
items_raw = pd.read_csv(RAW_PATH + 'items.csv')
items = items_raw[item_map_cols]

In [None]:
sales_train.head()

In [None]:
items.head()

In [None]:
# Attach the category of each sold item. `validate='many_to_one'` makes pandas
# raise a MergeError if `items` ever contains a duplicated item_id; without it
# such duplicates would silently multiply sales rows and inflate every
# aggregate computed from `merged_df`.
merged_df = sales_train.merge(items, how='left', on='item_id', validate='many_to_one')

In [None]:
merged_df.isna().sum()

In [None]:
merged_df.to_parquet(PROCESSED_PATH + 'merged_train_df.parquet', index=False)

# Constructing target

This will be a simplified version of the target: instead of a rolling window of 30 days, we will simply use the month id to sum the number of items sold. We will calculate this for each shop and item, add missing months with 0 sales, and shift the aggregate by 1 month.

This functionality has been moved to src.TestGenerator

## Building features

## Features describing sales for `shop_id` and `date_block_num`

For all feature construction we have to use the expanded dataset to preserve the sequence of months. If some months were missing, then shifts and window aggregates would not be correct, since we are not working with a datetime column here.

This functionality has been implemented and moved to src.FeatureGenerator

### Counts of deals per month and per shop, lags, rolling aggregates

This functionality has been implemented and moved to src.FeatureGenerator

### Aggregates over prices per month and per shop, lags, window aggregates

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `category_id` and `date_block_num`

In [None]:
# Column roles shared by the feature-building cells of this section.
index_cols = ['shop_id', 'item_id', 'date_block_num']
base_cols = ['item_price', 'item_cnt_day']
target_col = ['target']

# Reload the merged sales data and keep an independent, freshly indexed copy
# of just the columns that the aggregates need.
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
local_df = (
    merged_df[['date_block_num', 'item_id', 'shop_id', 'item_cnt_day', 'item_price']]
    .reset_index(drop=True)
    .copy()
)

# Frame that all generated features get merged onto
# (its exact contents are defined by src.utilities.generate_backbone).
res_df = generate_backbone()

In [None]:
# base aggregates: every function in ROLL_FUNCS applied to every base column
agg_di = dict.fromkeys(base_cols, ROLL_FUNCS)

for k, group in GROUP_COLS.items():
    agg_df = local_df.groupby(group, as_index=False).agg(agg_di)

    # Flatten the two-level (column, function) header. Group-key columns carry
    # an empty second level and keep their plain name.
    flat_names = []
    for header in agg_df.columns:
        if header[1]:
            flat_names.append('_'.join(header) + f'_per_{k}')
        else:
            flat_names.append(header[0])
    agg_df.columns = flat_names

    # Backbone rows without any matching sales get 0 for the aggregates.
    res_df = res_df.merge(agg_df, how='left', on=group).fillna(0)

# The per shop-item sum of sold items is what we want to predict.
res_df = res_df.rename(columns={'item_cnt_day_sum_per_shop_item': 'target'})

non_feature_cols = index_cols + base_cols + target_col
base_feat_cols = [name for name in res_df if name not in non_feature_cols]

In [None]:
# lag aggregates
cols_to_shift = base_feat_cols + target_col
shifted_cols = []
for shift in SHIFTS:
    # Names of the lagged copies produced for this shift.
    rename_dict = {col: f'{col}_lag_{shift}' for col in cols_to_shift}
    shifted_cols.extend(rename_dict.values())

    # Moving every row `shift` months forward means that, after the merge on
    # the index columns, month m receives the values observed in month m - shift.
    preshift_df = (
        res_df[index_cols + cols_to_shift]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + shift)
        .rename(columns=rename_dict)
    )

    # Months without an earlier counterpart get 0 for the lagged columns.
    res_df = res_df.merge(preshift_df, how='left', on=index_cols).fillna(0)

In [None]:
# rolling window aggregates
# For every window length in WINS, compute a rolling mean of the target column
# and merge it back into res_df as `<target>_roll_mean_<win_len>`.
roll_cols = []  # names of the rolling-mean columns created below
col = target_col[0]
for win_len in WINS:
    # groupping_k = col.split('_per_')[1] if not col == target_col[0] else 'shop_item'
    group = GROUP_COLS['shop_item']
    # One row per distinct (group keys, target) combination, ordered by the group keys.
    roll_df = res_df[group + [col]].drop_duplicates().sort_values(group)
    new_name = f'{col}_roll_mean_{win_len}'
    # The frame is partitioned by all group keys except the last one
    # (presumably date_block_num is last in GROUP_COLS['shop_item'] -- TODO confirm
    # against src.settings) and the mean is rolled over `win_len` rows per partition.
    # NaN results are replaced with 0.
    # NOTE(review): closed='right' keeps the current row inside the window, so the
    # mean appears to include the current month's target -- confirm this does not
    # leak the label into the features.
    roll_df = roll_df.groupby(group[:-1], as_index=False)\
                        .rolling(win_len, on='date_block_num', closed='right')[col].mean().fillna(0).reset_index()\
                        .rename(columns={col: new_name})
    res_df = res_df.merge(roll_df, how='left', on=group)
    roll_cols.append(new_name)
    # print(roll_df.head())
    # print('-'*30)
    # tmp.groupby('shop_id', as_index=False).rolling(2, on='date_block_num')[base_feat_cols[0]].sum()

In [None]:
res_df.head()

This functionality has been moved to src.FeatureGenerator

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales for `shop_id`, `item_id` and `date_block_num`

## Autoregression features

This functionality has been implemented and moved to src.FeatureGenerator

## Features describing sales of a given category in general and for a given shop

In [3]:
# Column roles used by the category-level feature cell below.
index_cols = ['shop_id', 'item_id', 'date_block_num']
base_cols = ['item_price', 'item_cnt_day']
target_col = ['target']
cat_col = ['item_category_id']

# Groupings for the category features: per category and month, and per shop,
# category and month.
# NOTE(review): this rebinds the GROUP_COLS imported from src.settings for the
# rest of the kernel session. Cells that look up GROUP_COLS['shop_item'] will
# fail with a KeyError if they are (re-)run after this cell.
GROUP_COLS = {
    'category': cat_col + ['date_block_num'],
    'shop_category': ['shop_id'] + cat_col + ['date_block_num'],
}

In [14]:
# Reload the merged sales data and keep an independent copy of the columns
# needed for the category-level aggregates.
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
local_df = (
    merged_df[['date_block_num', 'item_id', 'shop_id',
               'item_cnt_day', 'item_price', 'item_category_id']]
    .reset_index(drop=True)
    .copy()
)

# item -> category lookup built from the sales records, attached to every
# backbone row. Backbone items that never appear in the sales data get a NaN
# category here.
item_cat_map = local_df[['item_id', 'item_category_id']].drop_duplicates()
res_df = generate_backbone().merge(item_cat_map, how='left', on='item_id')

# Every function in ROLL_FUNCS applied to every base column.
agg_di = {col: ROLL_FUNCS for col in base_cols}

for k, group in GROUP_COLS.items():
    agg_df = local_df.groupby(group, as_index=False).agg(agg_di)
    # Flatten the two-level (column, function) header; group-key columns carry
    # an empty second level and keep their plain name.
    agg_df.columns = ['_'.join(col) + f'_per_{k}' if col[1] else col[0] for col in agg_df.columns]
    print(k)

    # Fill ONLY the freshly merged aggregate columns with 0. A frame-wide
    # fillna(0) would also turn an unknown item_category_id (NaN) into the
    # category id 0, and the next grouping would then join those rows onto the
    # aggregates of category 0.
    new_cols = [name for name in agg_df.columns if name not in group]
    res_df = res_df.merge(agg_df, how='left', on=group)
    res_df[new_cols] = res_df[new_cols].fillna(0)

non_feature_cols = set(index_cols + base_cols + target_col + cat_col)
base_feat_cols = [str(col) for col in res_df if col not in non_feature_cols]

category
shop_category


In [15]:
base_feat_cols

['item_price_sum_per_category',
 'item_price_mean_per_category',
 'item_cnt_day_sum_per_category',
 'item_cnt_day_mean_per_category',
 'item_price_sum_per_shop_category',
 'item_price_mean_per_shop_category',
 'item_cnt_day_sum_per_shop_category',
 'item_cnt_day_mean_per_shop_category']

In [11]:
res_df

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_category,item_price_mean_per_category,item_cnt_day_sum_per_category,item_cnt_day_mean_per_category,item_price_sum_per_shop_category,item_price_mean_per_shop_category,item_cnt_day_sum_per_shop_category,item_cnt_day_mean_per_shop_category
0,26,0,0,40.0,7.145102e+06,244.485942,33489.0,1.145902,152869.000000,261.314530,607.0,1.037607
1,26,0,1,40.0,6.771906e+06,252.174936,31649.0,1.178558,151206.566667,258.915354,631.0,1.080479
2,26,0,2,40.0,8.023436e+06,260.645016,36277.0,1.178475,174743.266667,264.361977,725.0,1.096823
3,26,0,3,40.0,6.045864e+06,254.852406,27525.0,1.160266,156502.833333,266.161281,647.0,1.100340
4,26,0,4,40.0,5.554534e+06,247.627583,25470.0,1.135482,115066.833333,252.339547,488.0,1.070175
...,...,...,...,...,...,...,...,...,...,...,...,...
2261335,28,22169,29,69.0,5.913720e+05,938.685683,660.0,1.047619,28744.000000,1105.538462,27.0,1.038462
2261336,28,22169,30,69.0,7.748948e+05,738.698594,1124.0,1.071497,34694.150000,963.726389,36.0,1.000000
2261337,28,22169,31,69.0,7.737686e+05,888.368037,901.0,1.034443,27395.000000,1095.800000,25.0,1.000000
2261338,28,22169,32,69.0,6.168589e+05,947.555853,665.0,1.021505,37622.000000,1140.060606,33.0,1.000000


This functionality has been added to src.FeatureGenerator

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predict the number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create feature dataset to predict target for a given test.csv.

FeatureGenerator and TestGenerator classes have been created and moved to src. We've imported it above and will demonstrate usage for the cases of crossvalidation and inference for a provided test index backbone here.

In [2]:
# Suppose we want month 11 to be the target month, i.e. the month we predict.
demo_target_months = [11]
feat_generator = FeatureGenerator(target_months=demo_target_months)

test_features_df = feat_generator.generate_features()
test_features_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,item_price_mean_per_item_lag_1,...,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12,target
0,26,32,11,40.0,1433133.0,820.809499,2409.0,1.379725,11401.0,144.316456,...,0.0,0.0,0.0,0.0,0.0,0.0,1.5,1.4,0.0,1.0
1,26,33,11,37.0,1433133.0,820.809499,2409.0,1.379725,2923.0,194.866667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.4,0.0,3.0
2,26,66,11,37.0,1433133.0,820.809499,2409.0,1.379725,399.0,399.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,26,156,11,45.0,1433133.0,820.809499,2409.0,1.379725,2093.0,299.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,1.0
4,26,226,11,45.0,1433133.0,820.809499,2409.0,1.379725,1245.0,249.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


So, features for a generated shop-item backbone and a given month are generated correctly. Let's now use this functionality
to generate features for the provided test index backbone.

In [2]:
# We are told that the test month is November, i.e. the month right after the
# last one in the train set, which makes it month number 34.
test_month_num = 34

# The provided test dataset itself serves as the backbone. Every row is tagged
# with `test_month_num` (rather than a second hard-coded 34) so the backbone
# month and the `target_month` argument below cannot drift apart.
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')
test_backbone['date_block_num'] = test_month_num

feat_generator = FeatureGenerator()
test_features_df = feat_generator.add_features_to_backbone(test_backbone=test_backbone, target_month=test_month_num)

{'shop_id': (2, 59), 'item_id': (30, 22167), 'date_block_num': (22, 34)}
   shop_id  item_id  date_block_num  item_category_id
0        2       30              22              40.0
1        2       30              23              40.0
2        2       30              24              40.0
3        2       30              25              40.0
4        2       30              26              40.0
------------------------------
   shop_id  item_id  date_block_num  item_category_id   
0        2       30              22              40.0  \
1        2       30              23              40.0   
2        2       30              24              40.0   
3        2       30              25              40.0   
4        2       30              26              40.0   

   item_price_sum_per_shop  item_price_mean_per_shop   
0             1.434123e+06               1673.422631  \
1             2.704724e+06               1773.589416   
2             1.329241e+06               1526.108863   
3    

In [3]:
test_features_df[test_features_df['item_id'] == 5037]

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,...,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12,target
0,0,5,5037,34,19.0,994646.7,1030.721934,1052.0,1.090155,37475.0,...,1.144403,120378.0,1695.464789,77.0,1.084507,1.0,0.5,1.2,1.0,0.0
5100,5100,4,5037,34,19.0,685485.7,972.320189,831.0,1.178723,37475.0,...,1.144403,107827.0,1711.539683,64.0,1.015873,0.0,0.0,0.6,0.416667,0.0
10200,10200,6,5037,34,19.0,2012338.0,1314.39428,1802.0,1.177008,37475.0,...,1.144403,243949.0,1951.592,150.0,1.2,4.0,0.5,0.6,1.25,0.0
15300,15300,3,5037,34,19.0,734662.0,1343.074954,613.0,1.120658,37475.0,...,1.144403,149765.925,1682.763202,95.0,1.067416,2.0,0.0,0.6,1.0,0.0
20400,20400,2,5037,34,19.0,1072151.0,1670.016874,727.0,1.132399,37475.0,...,1.144403,202695.3,1762.567826,126.0,1.095652,0.0,0.5,0.8,1.0,0.0
25500,25500,7,5037,34,19.0,1416323.0,1323.666296,1212.0,1.13271,37475.0,...,1.144403,269762.0,1822.716216,155.0,1.047297,2.0,2.5,2.6,2.0,0.0
30600,30600,10,5037,34,19.0,422205.7,1077.055357,428.0,1.091837,37475.0,...,1.144403,100520.07,1733.104655,62.0,1.068966,1.0,0.0,0.2,0.333333,0.0
35700,35700,12,5037,34,19.0,1592887.0,1490.071772,4181.0,3.911132,37475.0,...,1.144403,160656.785714,1935.623924,142.0,1.710843,0.0,-0.5,0.6,0.666667,0.0
40800,40800,28,5037,34,19.0,2866024.0,1140.933205,3018.0,1.201433,37475.0,...,1.144403,502857.6,1728.03299,352.0,1.209622,1.0,5.0,3.6,3.666667,0.0
45900,45900,31,5037,34,19.0,4686812.0,964.960253,6112.0,1.25839,37475.0,...,1.144403,525415.833333,1751.386111,395.0,1.316667,2.0,12.0,7.4,5.833333,0.0


In [14]:
merged_df[merged_df['item_id'] == 5037]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
1953995,21.09.2014,20,5,5037,2599.0,1.0,19
1956692,12.09.2014,20,4,5037,2599.0,1.0,19
1956970,12.09.2014,20,3,5037,2599.0,1.0,19
1956971,14.09.2014,20,3,5037,2599.0,1.0,19
1956972,18.09.2014,20,3,5037,2599.0,1.0,19
...,...,...,...,...,...,...,...
2925201,31.10.2015,33,26,5037,1499.0,1.0,19
2925689,06.10.2015,33,31,5037,1499.0,1.0,19
2929098,11.10.2015,33,24,5037,1499.0,1.0,19
2931908,29.10.2015,33,21,5037,1499.0,1.0,19


In [26]:
def construct_cols_min_max(dfs: list[pd.DataFrame], cols: list[str]) -> dict:
    """Return the overall (min, max) of each column across several frames.

    Args:
        dfs: Frames to scan; each one must contain every column in `cols`.
        cols: Names of the columns to compute bounds for.

    Returns:
        Dict mapping each column name to a `(lowest, highest)` tuple taken
        over all frames in `dfs`.
    """
    bounds = {}
    for col in cols:
        # per-frame extremes first, then the extremes of those
        lows = [frame[col].min() for frame in dfs]
        highs = [frame[col].max() for frame in dfs]
        bounds[col] = (min(lows), max(highs))
    return bounds

# Value ranges of the index columns over the test backbone and the train data.
bound_frames = [test_backbone, merged_df]
bound_cols = ['shop_id', 'item_id', 'date_block_num']
construct_cols_min_max(dfs=bound_frames, cols=bound_cols)

{'shop_id': (0, 59), 'item_id': (0, 22169), 'date_block_num': (0, 34)}

In [21]:
test_backbone

Unnamed: 0,ID,shop_id,item_id,date_block_num
0,0,5,5037,34
1,1,5,5320,34
2,2,5,5233,34
3,3,5,5232,34
4,4,5,5268,34
...,...,...,...,...
214195,214195,45,18454,34
214196,214196,45,16188,34
214197,214197,45,15757,34
214198,214198,45,19648,34


In [9]:
# Test rows whose shop_id lies within the configured shop range (upper bound inclusive).
shop_bounds = COLS_MIN_MAX['shop_id']
train_shop_range = range(shop_bounds[0], shop_bounds[1] + 1)
in_train_shop_range = test_features_df['shop_id'].isin(train_shop_range)
test_features_df[in_train_shop_range]

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_category_id,item_price_sum_per_shop_lag_1,item_price_mean_per_shop_lag_1,item_cnt_day_sum_per_shop_lag_1,item_cnt_day_mean_per_shop_lag_1,item_price_sum_per_item_lag_1,...,item_cnt_day_mean_per_category_lag_12,item_price_sum_per_shop_category_lag_12,item_price_mean_per_shop_category_lag_12,item_cnt_day_sum_per_shop_category_lag_12,item_cnt_day_mean_per_shop_category_lag_12,target_lag_12,target_roll_mean_2,target_roll_mean_5,target_roll_mean_12,target
40800,40800,28,5037,34,19.0,2.866024e+06,1140.933205,3018.0,1.201433,37475.0,...,1.144403,502857.600000,1728.032990,352.0,1.209622,1.0,5.0,3.6,3.666667,0.0
40801,40801,28,5320,34,0.0,2.866024e+06,1140.933205,3018.0,1.201433,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
40802,40802,28,5233,34,19.0,2.866024e+06,1140.933205,3018.0,1.201433,49159.0,...,1.144403,502857.600000,1728.032990,352.0,1.209622,0.0,5.0,4.2,2.000000,0.0
40803,40803,28,5232,34,23.0,2.866024e+06,1140.933205,3018.0,1.201433,35713.0,...,1.099875,472590.100000,1559.703300,360.0,1.188119,0.0,3.5,1.4,0.583333,0.0
40804,40804,28,5268,34,0.0,2.866024e+06,1140.933205,3018.0,1.201433,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56095,56095,26,18454,34,55.0,1.413624e+06,1259.914530,1409.0,1.255793,198.0,...,1.041406,41979.700000,310.960741,137.0,1.014815,0.0,0.0,0.0,0.416667,0.0
56096,56096,26,16188,34,64.0,1.413624e+06,1259.914530,1409.0,1.255793,1359.0,...,1.035102,11077.000000,852.076923,14.0,1.076923,0.0,0.0,0.0,0.000000,0.0
56097,56097,26,15757,34,55.0,1.413624e+06,1259.914530,1409.0,1.255793,1145.0,...,1.041406,41979.700000,310.960741,137.0,1.014815,0.0,0.0,0.0,0.083333,0.0
56098,56098,26,19648,34,40.0,1.413624e+06,1259.914530,1409.0,1.255793,178.2,...,1.090300,58121.966667,302.718576,199.0,1.036458,0.0,0.0,0.2,0.250000,0.0


# Cross validation

Here we will check whether the existing functionality is enough to run a round of cross-validation.
Overall idea:
- data for all folds is generated using TestGenerator and FeatureGenerator classes
- data is split to train/test by order number of month using sklearn.TimeSeriesSplit

In [None]:
# TODO: consider introducing 1-2 months gap between train and test
# TODO: it should be possible to use sklearn cross validation functionality: 
# 1) build df with all features and offload it to data/processed 
# 2) build custom layer to sample from the dataset using sklearn.TimeSeriesSplit based on months

In [None]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')

# `unique()` returns values in order of first appearance, which is only
# chronological if the file happens to be sorted. TimeSeriesSplit splits purely
# by position, so sort the months explicitly to guarantee that train indices
# always precede test indices in time.
all_months = np.sort(merged_df['date_block_num'].unique())
all_months = all_months[all_months > max(WINS+SHIFTS)] # leaving enough months for longest shift/window calculation
print(f'all months: {all_months}\n len: {len(all_months)}')

# Preview the folds: the splitter yields positions into `all_months`.
tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Let's try the implemented functionality to run a simple time series cross validation. We will use a simple mean predictor as a model for now.

In [None]:
# Data generator, toy baseline model and the month-based CV splitter.
feat_generator = FeatureGenerator()
model = ToyModel()
tscv = TimeSeriesSplit(test_size=2, max_train_size=11)

# Generate all of the data once; the CV run below iterates over it.
features_df = feat_generator.generate_features()

# Column roles handed to run_cv. These are read after generate_features(),
# in the same order as before.
cols_di = dict(
    index=feat_generator.index_cols,
    target=feat_generator.target_col,
    feats=feat_generator.shifted_cols + feat_generator.roll_cols,
)

# Run the cross validation over all folds.
cv_res = run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di, verbose=2)

We see that functionality built is sufficient and we can now go on to try more complex models.

In [None]:
# Same CV run as above, with a linear model in place of the toy baseline.
# LinearRegression and XGBRegressor are imported in the notebook's first cell,
# so all dependencies are visible in one place.
model = LinearRegression()
# model = XGBRegressor()

# creating col lists for training
cols_di={
    'index': feat_generator.index_cols,
    'target': feat_generator.target_col,
    'feats': feat_generator.roll_cols + feat_generator.shifted_cols 
}

# iterating over CV folds
cv_res = run_cv(df=features_df, months_cv_split=tscv, model=model, cols_di=cols_di, verbose=2)