In [52]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import *
from itertools import product
import pandas as pd
import numpy as np

from src.FeatureGenerator import *
from src.TestGenerator import *
from src.settings import RAW_PATH, PROCESSED_PATH, WINS_SHIFTS, ROLL_FUNCS, COLS_MIN_MAX #SHOP_ID_MIN_MAX, ITEM_ID_MIN_MAX

# Initial merge of `sales_train` and `items`

In [23]:
sales_train = pd.read_csv(RAW_PATH + 'sales_train.csv')
items = pd.read_csv(RAW_PATH + 'items.csv')[['item_id', 'item_category_id']]

In [24]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [25]:
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [26]:
merged_df = sales_train.merge(items, how='left', on='item_id')

In [27]:
merged_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [29]:
merged_df.to_parquet(PROCESSED_PATH + 'merged_train_df.parquet', index=False)

# Constructing target

This will be a simpler version of target where instead of rolling window of 30 days we will simply use the month id to sum aggregate number of items sold. We will calculate this for each shop and item, add missing months with 0 sales and shift the aggregate by 1 month.

In [261]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')

In [30]:
# creating groupping for particular month, shop and item
grouping_cols = ['shop_id', 'item_id', 'date_block_num']
target_df = merged_df[grouping_cols + ['item_cnt_day']].sort_values(grouping_cols)
target_df = target_df.groupby(grouping_cols)['item_cnt_day'].sum().reset_index() 

In [31]:
# creating dataframe where for each combination of shop and item every month is present
index_backbone = pd.DataFrame(product(
    range(target_df['shop_id'].min(), target_df['shop_id'].max()+1),
    range(target_df['item_id'].min(), target_df['item_id'].max()+1),
    range(target_df['date_block_num'].min(), target_df['date_block_num'].max()+1)
), columns = ['shop_id', 'item_id', 'date_block_num'])

In [32]:
extended_target_df = index_backbone.merge(target_df, how='left', on=grouping_cols).fillna(0)
extended_target_df = extended_target_df.sort_values(grouping_cols)

In [33]:
# grouping by shop_id and item_id and shifting by 1 row "into the future"
extended_target_df['target'] = extended_target_df.groupby(grouping_cols[:-1])['item_cnt_day'].shift(-1)

# leaving only rows with deals in current month as was in the dataset before expansion
shrinked_target_df = extended_target_df[extended_target_df['item_cnt_day'] > 0].reset_index(drop=True).fillna(0)
shrinked_target_df.head()

# target_df = target_df[((target_df['item_id']==5822) & (target_df['shop_id'] == 2) & \
#                        (target_df['date_block_num'] < 6)) | ((target_df['item_id']==100) & \
#                        (target_df['shop_id'] == 57))]

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,target
0,0,30,1,31.0,0.0
1,0,31,1,11.0,0.0
2,0,32,0,6.0,10.0
3,0,32,1,10.0,0.0
4,0,33,0,3.0,3.0


In [55]:
shrinked_target_df.shape

(1605626, 5)

In [10]:
shrinked_target_df.to_parquet(PROCESSED_PATH + 'target_df.parquet', index=False)

## Building features

## Features describing sales for `shop_id` and `date_block_num`

In [30]:
list(merged_df)

['date',
 'date_block_num',
 'shop_id',
 'item_id',
 'item_price',
 'item_cnt_day',
 'item_category_id']

For all feature construction we have to use expanded dataset as was built in the previous section to preserve the sequence of months. If some months are missing, than shifts and window aggregates would not be correct since we are not working with the datetime column here.

In [26]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')

# creating dataframe where for each combination of shop and item every month is present
shop_month_index_backbone = pd.DataFrame(product(
    range(merged_df['shop_id'].min(), merged_df['shop_id'].max()+1),
    range(merged_df['date_block_num'].min(), merged_df['date_block_num'].max()+1)
), columns = ['shop_id', 'date_block_num'])

### Counts of deals per month and per shop, lags, rolling aggregates

In [27]:
# adding deals count column
deals_cnt_df = merged_df[['date_block_num', 'shop_id', 'item_id']].reset_index(drop=True).copy()
group_cols = ['shop_id', 'date_block_num']
deals_cnt_df = deals_cnt_df.sort_values(group_cols).groupby(group_cols)\
        ['item_id'].count().reset_index().rename(columns={'item_id': 'deals_cnt'})
deals_cnt_df = shop_month_index_backbone.merge(deals_cnt_df, how='left', on=group_cols).fillna(0)

# calculating lags
deals_cnt_df = deals_cnt_df.set_index('shop_id')
for shift in WINS_SHIFTS:
    deals_cnt_df[f'deals_cnt_shift_{shift}'] = deals_cnt_df.groupby('shop_id')['deals_cnt'].shift(periods=shift, fill_value=0)
deals_cnt_df = deals_cnt_df.reset_index()

# calculating rolling window aggregates
deals_cnt_df = deals_cnt_df.sort_values(group_cols)
roll_funcs = ROLL_FUNCS
for func in roll_funcs:
    for win_len in WINS_SHIFTS:
        deals_cnt_df[f'deals_cnt_roll_{func}_{win_len}'] = deals_cnt_df.groupby('shop_id').rolling(win_len, min_periods=1)\
                .agg({'deals_cnt': func}).reset_index(drop=True).fillna(0)

In [6]:
deals_cnt_df

Unnamed: 0,shop_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,deals_cnt_roll_sum_12,deals_cnt_roll_mean_2,...,deals_cnt_roll_mean_12,deals_cnt_roll_std_2,deals_cnt_roll_std_6,deals_cnt_roll_std_12,deals_cnt_roll_min_2,deals_cnt_roll_min_6,deals_cnt_roll_min_12,deals_cnt_roll_max_2,deals_cnt_roll_max_6,deals_cnt_roll_max_12
0,0,0,4793.0,0.0,0.0,0.0,4793.0,4793.0,4793.0,4793.0,...,4793.000000,0.000000,0.000000,0.000000,4793.0,4793.0,4793.0,4793.0,4793.0,4793.0
1,0,1,5064.0,0.0,0.0,0.0,9857.0,9857.0,9857.0,4928.5,...,4928.500000,191.625938,191.625938,191.625938,4793.0,4793.0,4793.0,5064.0,5064.0,5064.0
2,0,2,0.0,4793.0,0.0,0.0,5064.0,9857.0,9857.0,2532.0,...,3285.666667,3580.788740,2848.695198,2848.695198,0.0,0.0,0.0,5064.0,5064.0,5064.0
3,0,3,0.0,5064.0,0.0,0.0,0.0,9857.0,9857.0,0.0,...,2464.250000,0.000000,2847.620805,2847.620805,0.0,0.0,0.0,0.0,5064.0,5064.0
4,0,4,0.0,0.0,0.0,0.0,0.0,9857.0,9857.0,0.0,...,1971.400000,0.000000,2701.150459,2701.150459,0.0,0.0,0.0,0.0,5064.0,5064.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2035,59,29,836.0,792.0,1651.0,1013.0,1585.0,5427.0,12409.0,792.5,...,1034.083333,61.518290,133.824886,240.913964,749.0,749.0,749.0,836.0,1091.0,1651.0
2036,59,30,916.0,749.0,1091.0,974.0,1752.0,5252.0,12351.0,876.0,...,1029.250000,56.568542,99.791115,242.803372,836.0,749.0,749.0,916.0,1011.0,1651.0
2037,59,31,973.0,836.0,948.0,1127.0,1889.0,5277.0,12197.0,944.5,...,1016.416667,40.305087,103.870593,241.231861,916.0,749.0,749.0,973.0,1011.0,1651.0
2038,59,32,778.0,916.0,1011.0,941.0,1751.0,5044.0,12034.0,875.5,...,1002.833333,137.885822,87.071618,250.283778,778.0,749.0,749.0,973.0,973.0,1651.0


In [77]:
deals_cnt_df[((deals_cnt_df['shop_id'] == 9)) ]

Unnamed: 0,shop_id,date_block_num,deals_cnt
306,9,0,0.0
307,9,1,0.0
308,9,2,0.0
309,9,3,0.0
310,9,4,0.0
311,9,5,0.0
312,9,6,0.0
313,9,7,0.0
314,9,8,0.0
315,9,9,1488.0


In [539]:
deals_cnt_df[deals_cnt_df['shop_id']==59].head()

Unnamed: 0,shop_id,date_block_num,deals_cnt,deals_cnt_shift_1,deals_cnt_shift_3,deals_cnt_shift_7,deals_cnt_shift_12,deals_cnt_shift_20,deals_cnt_sum_3,deals_cnt_sum_6,...,deals_cnt_min_3,deals_cnt_min_6,deals_cnt_min_9,deals_cnt_min_15,deals_cnt_min_20,deals_cnt_max_3,deals_cnt_max_6,deals_cnt_max_9,deals_cnt_max_15,deals_cnt_max_20
1552,59,0,1847,0,0,0,0,0,1847.0,1847.0,...,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0
1553,59,1,1696,1847,0,0,0,0,3543.0,3543.0,...,1696.0,1696.0,1696.0,1696.0,1696.0,1847.0,1847.0,1847.0,1847.0,1847.0
1554,59,2,1743,1696,0,0,0,0,5286.0,5286.0,...,1696.0,1696.0,1696.0,1696.0,1696.0,1847.0,1847.0,1847.0,1847.0,1847.0
1555,59,3,1271,1743,1847,0,0,0,4710.0,6557.0,...,1271.0,1271.0,1271.0,1271.0,1271.0,1743.0,1847.0,1847.0,1847.0,1847.0
1556,59,4,1194,1271,1696,0,0,0,4208.0,7751.0,...,1194.0,1194.0,1194.0,1194.0,1194.0,1743.0,1847.0,1847.0,1847.0,1847.0


In [78]:
deals_cnt_df.shape

(2040, 3)

### Aggregates over prices per month and per shop, lags, window aggregates

In [9]:
# adding simple aggregates of prices over various deals
prices_df = merged_df.reset_index()[['shop_id', 'date_block_num', 'item_price']]
group_cols = ['shop_id', 'date_block_num']
simple_agg_funcs = ROLL_FUNCS
prices_df = prices_df.groupby(group_cols).agg({'item_price': simple_agg_funcs}).fillna(0)
prices_df.columns = ['_'.join(col) for col in prices_df.columns]
prices_df = prices_df.reset_index()
prices_df = shop_month_index_backbone.merge(prices_df, how='left').fillna(0)
simple_agg_cols = [f'item_price_{agg}' for agg in simple_agg_funcs]

# adding lags
prices_df = prices_df.sort_values(group_cols).set_index('shop_id')
for shift in WINS_SHIFTS:
    for col in simple_agg_cols:
        prices_df[f'{col}_shift_{shift}'] = prices_df.groupby('shop_id')[col].shift(periods=shift, fill_value=0)
prices_df = prices_df.reset_index()

# adding window aggregates
prices_df = prices_df.sort_values(group_cols)
roll_funcs = ROLL_FUNCS
cols_to_agg = ['item_price_sum', 'item_price_mean']

for func in roll_funcs:
    for win_len in WINS_SHIFTS:
        for col in cols_to_agg:
            prices_df[f'{col}_roll_{func}_{win_len}'] = prices_df.groupby('shop_id').rolling(win_len, min_periods=1)\
                    .agg({col: func}).reset_index(drop=True).fillna(0)

## Features describing sales for `shop_id`, `category_id` abd `date_block_num`

## Feature describing sales for `shop_id`, `item_id` and `date_block_num`

## Autoregression features

# Final merge

We are left joining all the feature datasets to the target dataset here

In [34]:
shrinked_target_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,target
0,0,30,1,31.0,0.0
1,0,31,1,11.0,0.0
2,0,32,0,6.0,10.0
3,0,32,1,10.0,0.0
4,0,33,0,3.0,3.0


In [35]:
deals_cnt_df.head()


Unnamed: 0,shop_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,deals_cnt_roll_sum_12,deals_cnt_roll_mean_2,...,deals_cnt_roll_mean_12,deals_cnt_roll_std_2,deals_cnt_roll_std_6,deals_cnt_roll_std_12,deals_cnt_roll_min_2,deals_cnt_roll_min_6,deals_cnt_roll_min_12,deals_cnt_roll_max_2,deals_cnt_roll_max_6,deals_cnt_roll_max_12
0,0,0,4793.0,0.0,0.0,0.0,4793.0,4793.0,4793.0,4793.0,...,4793.0,0.0,0.0,0.0,4793.0,4793.0,4793.0,4793.0,4793.0,4793.0
1,0,1,5064.0,0.0,0.0,0.0,9857.0,9857.0,9857.0,4928.5,...,4928.5,191.625938,191.625938,191.625938,4793.0,4793.0,4793.0,5064.0,5064.0,5064.0
2,0,2,0.0,4793.0,0.0,0.0,5064.0,9857.0,9857.0,2532.0,...,3285.666667,3580.78874,2848.695198,2848.695198,0.0,0.0,0.0,5064.0,5064.0,5064.0
3,0,3,0.0,5064.0,0.0,0.0,0.0,9857.0,9857.0,0.0,...,2464.25,0.0,2847.620805,2847.620805,0.0,0.0,0.0,0.0,5064.0,5064.0
4,0,4,0.0,0.0,0.0,0.0,0.0,9857.0,9857.0,0.0,...,1971.4,0.0,2701.150459,2701.150459,0.0,0.0,0.0,0.0,5064.0,5064.0


In [36]:
prices_df.head()

NameError: name 'prices_df' is not defined

In [None]:
fin_df = shrinked_target_df.merge(deals_cnt_df, how='left').merge(prices_df, how='left')

In [None]:
fin_df = fin_df.astype('float32')

In [22]:
fin_df.to_parquet(PROCESSED_PATH+'fin_training_df.parquet')

So we've created a dataset where for every month we have some descriptive features about sales and profits. We also have a target column that contains the number of items sold in the given shop. We can now train a model to predic number of items sold in the following month.

# Feature generation at inference time

Let's verify that we can create feature dataset to predict target for a given test.csv.

FeatureGenerator and TestGenerator classes have been created and moved to src. We've imported it above and will demonstrate usage for the cases of crossvalidation and inference for a provided test index backbone here.

In [2]:
feat_generator = FeatureGenerator()
test_generator = TestGenerator()
shop_item_backbone_back = test_generator.generate_shop_item_backbone(test_size=1000)
backbone = test_generator.add_month_to_backbone(shop_item_backbone = shop_item_backbone_back, 
                                  month_num=11)
test_features_df = test_generator.add_features_to_backbone(test_backbone=backbone, 
                                            feat_generator=feat_generator)
test_features_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,deals_cnt_roll_sum_12,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,7,4070,10,1960.0,2050.0,1539.0,0.0,3831.0,11750.0,21427.0,...,1208660.0,653.329888,947112.4,637.356965,1805427.0,921.136026,1805427.0,921.136026,1805427.0,921.136026
1,4,12143,10,1209.0,1239.0,1113.0,0.0,2450.0,7896.0,14785.0,...,935770.9,688.788252,182636.5,688.788252,1110282.0,918.34709,1110282.0,918.34709,1365703.0,918.34709
2,18,12077,10,1674.0,1769.0,2470.0,0.0,3229.0,11269.0,24001.0,...,1310901.0,877.704487,1310901.0,877.704487,2130369.0,1272.622116,2174952.0,1272.622116,2513204.0,1272.622116
3,48,19324,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23,20094,10,0.0,0.0,0.0,0.0,0.0,0.0,6963.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1395718.0,760.609444


So, features for a generated shop-item backbone and a given month are generated correctly. Let's now use this functionality could be used to generate features for the provided test index backbone.

In [3]:
# we are told that the test month is November, so following the last one in the train set, so 34-th 
test_month_num = 34 

# test_backbone in this case is the provided test dataset itself
test_backbone = pd.read_csv(RAW_PATH + 'test.csv')

feat_generator = FeatureGenerator()
test_features_df = test_generator.add_features_to_backbone(test_backbone=test_backbone, 
                                            feat_generator=feat_generator)

In [4]:
test_features_df.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,deals_cnt_roll_sum_6,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,0,5,5037,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5,5037,1,845.0,0.0,0.0,0.0,845.0,845.0,...,0.0,0.0,0.0,0.0,499735.96,591.403503,499735.96,591.403503,499735.96,591.403503
2,0,5,5037,2,1262.0,0.0,0.0,0.0,2107.0,2107.0,...,0.0,0.0,0.0,0.0,756773.413333,599.661976,756773.413333,599.661976,756773.413333,599.661976
3,0,5,5037,3,946.0,845.0,0.0,0.0,2208.0,3053.0,...,0.0,0.0,0.0,0.0,756773.413333,644.232294,756773.413333,644.232294,756773.413333,644.232294
4,0,5,5037,4,1060.0,1262.0,0.0,0.0,2006.0,4113.0,...,0.0,0.0,0.0,0.0,609443.75,644.232294,756773.413333,644.232294,756773.413333,644.232294


# Cross validation pathfinder

In here we will check if existing functionality is enough to do a round of crossvalidation.
Overall idea:
- will split to train/test by order number of month using sklearn.TimeSeriesSplit
- data for test is generated using TestGenerator functionality
- features for train are generated using FeatureGenerator class, target is calculated using TestGenerator class

In [53]:
#TODO consider introducing 1-2 months gap between train and test

In [61]:
class ToyModel():
    """Baseline model that predicts mean of the passed y vector at train time"""

    def fit(self, x: pd.DataFrame, y: pd.DataFrame):
        self.y_mean = y.mean().values[0]

    def predict(self, x):
        return np.array([self.y_mean] * x.shape[0])

In [15]:
merged_df = pd.read_parquet(PROCESSED_PATH + 'merged_train_df.parquet')
all_months = merged_df['date_block_num'].unique()
all_months = all_months[all_months > max(WINS_SHIFTS)] # leaving enough months for longest shift/window calculation
print(f'all months: {all_months}\n len: {len(all_months)}')

tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

all months: [13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33]
 len: 21
Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10]
  Test:  index=[11 12]
Fold 1:
  Train: index=[ 2  3  4  5  6  7  8  9 10 11 12]
  Test:  index=[13 14]
Fold 2:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16]
Fold 3:
  Train: index=[ 6  7  8  9 10 11 12 13 14 15 16]
  Test:  index=[17 18]
Fold 4:
  Train: index=[ 8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[19 20]


In [63]:
feat_generator = FeatureGenerator()
test_generator = TestGenerator(train=True)

# generating all of the data which we will iterate over during CV
target_df = test_generator.generate_target_for_month(list(range(13, 34)))
features_df = test_generator.add_features_to_backbone(test_backbone=target_df, 
                                                      feat_generator=feat_generator)

features_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,sum_sales_cnt,target,deals_cnt,deals_cnt_shift_2,deals_cnt_shift_6,deals_cnt_shift_12,deals_cnt_roll_sum_2,...,item_price_sum_roll_min_6,item_price_mean_roll_min_6,item_price_sum_roll_min_12,item_price_mean_roll_min_12,item_price_sum_roll_max_2,item_price_mean_roll_max_2,item_price_sum_roll_max_6,item_price_mean_roll_max_6,item_price_sum_roll_max_12,item_price_mean_roll_max_12
0,2,27,16,0.0,1.0,713.0,830.0,654.0,531.0,1431.0,...,986580.493219,1208.397169,712757.033333,950.342711,986835.2,1383.703357,2011789.0,1729.827302,2011789.0,1729.827302
1,2,30,14,0.0,1.0,830.0,830.0,735.0,681.0,1612.0,...,862041.86,1208.397169,518625.1,950.342711,1365037.0,1644.622662,2011789.0,1729.827302,2011789.0,1729.827302
2,2,30,15,1.0,1.0,718.0,782.0,634.0,556.0,1548.0,...,883355.333333,1208.397169,518625.1,950.342711,1365037.0,1644.622662,2011789.0,1729.827302,2011789.0,1729.827302
3,2,31,15,0.0,1.0,718.0,782.0,634.0,556.0,1548.0,...,883355.333333,1208.397169,518625.1,950.342711,1365037.0,1644.622662,2011789.0,1729.827302,2011789.0,1729.827302
4,2,31,32,0.0,1.0,675.0,740.0,639.0,709.0,1520.0,...,880431.375952,1041.930622,840096.731111,1041.930622,1001152.0,1483.187718,1001152.0,1483.187718,2704724.0,1773.589416


In [66]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
target_col = ['target']
feat_cols = [col for col in list(features_df) if col not in index_cols+target_col]

tscv = TimeSeriesSplit(test_size = 2, max_train_size=11)
for i, (train_index, test_index) in enumerate(tscv.split(all_months)):
    train_months = all_months[train_index]
    test_months = all_months[test_index]
    
    print(f"Fold {i}:")
    print(f"  Train: index={all_months[train_index]}")
    print(f"  Test:  index={all_months[test_index]}")

    train_df = features_df[features_df['date_block_num'].isin(train_months)]
    test_df = features_df[features_df['date_block_num'].isin(test_months)]

    toy = ToyModel()
    toy.fit(x=train_df[feat_cols], y=train_df[target_col])
    y_true = test_df[target_col].values
    y_pred = toy.predict(x=test_df[feat_cols])
    rmse = mean_squared_error(y_true=y_true, y_pred=y_pred)**(.5)
    print(f'  RMSE: {rmse}\n')

Fold 0:
  Train: index=[13 14 15 16 17 18 19 20 21 22 23]
  Test:  index=[24 25]
  RMSE: 3.3172603907327156

Fold 1:
  Train: index=[15 16 17 18 19 20 21 22 23 24 25]
  Test:  index=[26 27]
  RMSE: 7.6795665442833645

Fold 2:
  Train: index=[17 18 19 20 21 22 23 24 25 26 27]
  Test:  index=[28 29]
  RMSE: 3.8562601506556113

Fold 3:
  Train: index=[19 20 21 22 23 24 25 26 27 28 29]
  Test:  index=[30 31]
  RMSE: 8.771280361928426

Fold 4:
  Train: index=[21 22 23 24 25 26 27 28 29 30 31]
  Test:  index=[32 33]
  RMSE: 9.59500249903982



In [None]:

    # num_shops = int(np.sqrt(test_size))
    # ideal_num_items = int(test_size / num_shops)
    # test_shops = np.random.choice(range(shop_id_min_max[0], 
    #                                     shop_id_min_max[1]+1), 
    #                               num_shops, replace=False)
    # back = []
    # for val in test_shops:
    #     num_items_to_pick = np.random.choice(range(int(ideal_num_items * (1-num_items_variety)),
    #                                             int(ideal_num_items * (1+num_items_variety))))
    #     test_items = np.random.choice(range(item_id_min_max[0], 
    #                                         item_id_min_max[1]+1), 
    #                                 num_items_to_pick, replace=False)
    #     back.append(pd.DataFrame(product([val], test_items), columns = ['shop_id', 'item_id']))
    # back_df = pd.concat(back, ignore_index=True).drop_duplicates()
    # return back_df