In [2]:
from src.settings import RAW_PATH, PROCESSED_PATH
import pandas as pd

# Initial merge of `sales_train` and `items`

In [2]:
# Raw daily sales records.
sales_train_path = RAW_PATH + 'sales_train.csv'
sales_train = pd.read_csv(sales_train_path)

# From the items table only the item -> category mapping is kept.
items_path = RAW_PATH + 'items.csv'
items = pd.read_csv(items_path).loc[:, ['item_id', 'item_category_id']]

In [11]:
# Preview the first rows of the raw sales table.
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [12]:
# Preview the item -> category lookup that is joined onto the sales below.
items.head()

Unnamed: 0,item_id,item_category_id
0,0,40
1,1,76
2,2,40
3,3,40
4,4,40


In [4]:
# Attach the category of every sold item. `validate='many_to_one'` makes pandas
# raise a MergeError if `items` ever contains a repeated `item_id`; without it
# a left join would silently duplicate the affected sales rows.
merged_df = sales_train.merge(
    items,
    how='left',
    on='item_id',
    validate='many_to_one',
)

In [7]:
# Sanity check after the left join: count missing values per column
# (a non-zero `item_category_id` count would mean unmatched item ids).
merged_df.isna().sum()

date                0
date_block_num      0
shop_id             0
item_id             0
item_price          0
item_cnt_day        0
item_category_id    0
dtype: int64

In [9]:
# Persist the merged table (without the row index) for the following sections.
merged_train_path = PROCESSED_PATH + 'merged_train_df.csv'
merged_df.to_csv(merged_train_path, index=False)

# Basic dataset transformer

In [47]:
# Reload the merged table, turn the day.month.year strings into timestamps
# and order the rows chronologically.
merged_df = (
    pd.read_csv(PROCESSED_PATH + 'merged_train_df.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%d.%m.%Y"))
    .sort_values('date')
)

In [48]:
# Show the date-sorted frame (the notebook display truncates it to the first and last rows).
merged_df

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
49800,2013-01-01,0,18,5823,2500.0,1.0,35
29784,2013-01-01,0,27,5573,849.0,1.0,2
35476,2013-01-01,0,7,1006,399.0,1.0,67
8330,2013-01-01,0,19,17707,899.0,1.0,19
57384,2013-01-01,0,14,19548,149.0,1.0,40
...,...,...,...,...,...,...,...
2885098,2015-10-31,33,41,21386,169.0,1.0,40
2930981,2015-10-31,33,21,988,199.0,1.0,37
2885097,2015-10-31,33,41,21377,169.0,1.0,40
2930993,2015-10-31,33,22,10207,1199.0,1.0,30


## Constructing target

This is a simpler version of the target: instead of a rolling 30-day window, we use the month id (`date_block_num`) to sum the number of items sold. We compute this sum for each shop and item, add the missing months with 0 sales, and shift the aggregate by one month, so that the target of each row is the number of items sold in the following month.

In [333]:
# Monthly sales are aggregated per (shop, item, month).
grouping_cols = ['shop_id', 'item_id', 'date_block_num']
sales_sorted_by_keys = merged_df.sort_values(by=grouping_cols)
window = sales_sorted_by_keys.groupby(by=grouping_cols)

In [334]:
# Sum the daily counts inside every (shop, item, month) group.
target_df = window.agg({'item_cnt_day': ['sum']}).sort_index()
# Flatten the two-level column labels, e.g. ('item_cnt_day', 'sum') -> 'item_cnt_day_sum'.
target_df.columns = [
    '_'.join(level_names) for level_names in target_df.columns.to_flat_index()
]
# Bring the grouping keys back as ordinary columns.
target_df = target_df.reset_index()

In [306]:
# Disabled debugging filter: restricts target_df to two sample shop/item pairs.
# target_df = target_df[((target_df['item_id']==5822) & (target_df['shop_id'] == 2) & (target_df['date_block_num'] < 6)) | ((target_df['item_id']==100) & (target_df['shop_id'] == 57))]

In [335]:
def reindex_group(group: pd.DataFrame) -> pd.DataFrame:
    """Fill the missing months of one group and add the next-month ``target``.

    Meant to be used in a ``groupby(['shop_id', 'item_id']).apply(...)`` clause.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single group with the columns ``shop_id``, ``item_id``,
        ``date_block_num`` and ``item_cnt_day_sum``.

    Returns
    -------
    pd.DataFrame
        One row for every month between the first and the last month present
        in ``group``. Months that were absent get ``item_cnt_day_sum == 0``;
        ``target`` holds ``item_cnt_day_sum`` of the following month.
        ``shop_id`` and ``item_id`` keep the dtype they had on input.

    Notes
    -----
    NOTE(review): the last month of every group always gets ``target == 0``
    (taken from the appended zero month). For the final month of the whole
    dataset the real next-month value is unknown rather than zero -- confirm
    that this is intended before training on those rows.
    """
    # Reindexing introduces NaN and therefore upcasts integer ids to float;
    # remember the incoming dtypes so they can be restored after filling.
    id_dtypes = group[['shop_id', 'item_id']].dtypes

    group = group.set_index('date_block_num')
    # One month past the last observed one is appended so that the last
    # observed month also receives a target value.
    new_index = range(group.index.min(), group.index.max() + 2)
    group = group.reindex(new_index)

    for id_col, id_dtype in id_dtypes.items():
        group[id_col] = group[id_col].ffill().astype(id_dtype)
    group['item_cnt_day_sum'] = group['item_cnt_day_sum'].fillna(0)

    group = group.reset_index()
    group['target'] = group['item_cnt_day_sum'].shift(-1)
    # Only the appended month lacks a target. Restricting the check to
    # `target` keeps the zero-filled gap months even when the group carries
    # additional columns that are NaN on those rows.
    group = group.dropna(subset=['target'])
    return group

# Build the target separately for every (shop, item) pair, then discard the
# index that `apply` assembles from the group keys.
shop_item_groups = target_df.groupby(['shop_id', 'item_id'])
target_df = shop_item_groups.apply(reindex_group)
target_df = target_df.reset_index(drop=True)

In [345]:
# Persist the target table (without the row index).
target_path = PROCESSED_PATH + 'target_df.csv'
target_df.to_csv(target_path, index=False)

## Feature engineering for [`shop_id`, `date_block_num`]

In [367]:
# List the columns available for feature engineering.
# NOTE(review): the saved output below has no `date` column although the cell
# that loads `merged_df` parses one -- the output looks stale; re-run to refresh.
list(merged_df)

['date_block_num',
 'shop_id',
 'item_id',
 'item_price',
 'item_cnt_day',
 'item_category_id']

### Counts of items per month and per shop, lags, rolling aggregates

In [585]:
# adding deals count column: number of sales records per shop and month
group_cols = ['shop_id', 'date_block_num']
deals_cnt_df = (
    merged_df.groupby(group_cols)['item_id']
    .count()
    .rename('deals_cnt')
    .reset_index()
)

# calculating lags
# The frame keeps its unique row index here, so the shifted values are matched
# back by row label instead of going through a duplicated `shop_id` index.
# NOTE(review): shift (and rolling below) step over existing rows. If a shop
# has no sales in some month there is no row for it, so `shift_k` is then not
# exactly k calendar months back -- confirm this is acceptable.
for shift in [1, 3, 7, 12, 20]:
    deals_cnt_df[f'deals_cnt_shift_{shift}'] = (
        deals_cnt_df.groupby('shop_id')['deals_cnt'].shift(periods=shift, fill_value=0)
    )

# calculating rolling window aggregates
deals_cnt_df = deals_cnt_df.sort_values(group_cols)
roll_funcs = ['sum', 'mean', 'std', 'min', 'max']
for func in roll_funcs:
    for win_len in [2, 4, 7, 15, 30]:
        rolled = (
            deals_cnt_df.groupby('shop_id')['deals_cnt']
            .rolling(win_len, min_periods=1)
            .agg(func)
        )
        # Drop only the `shop_id` level: the remaining level is the original
        # row label, so the assignment aligns by label and does not depend on
        # the frame's index being 0..n-1 in sorted order.
        # `std` of a single observation is NaN, hence the fillna(0).
        deals_cnt_df[f'deals_cnt_roll_{func}_{win_len}'] = (
            rolled.reset_index(level=0, drop=True).fillna(0)
        )

In [539]:
# Spot-check the generated features for a single shop (shop_id 59).
# NOTE(review): the saved output below shows column names such as
# `deals_cnt_sum_3`, which the current feature cell does not produce
# (it creates `deals_cnt_roll_<func>_<window>`) -- re-run to refresh.
deals_cnt_df[deals_cnt_df['shop_id']==59].head()

Unnamed: 0,shop_id,date_block_num,deals_cnt,deals_cnt_shift_1,deals_cnt_shift_3,deals_cnt_shift_7,deals_cnt_shift_12,deals_cnt_shift_20,deals_cnt_sum_3,deals_cnt_sum_6,...,deals_cnt_min_3,deals_cnt_min_6,deals_cnt_min_9,deals_cnt_min_15,deals_cnt_min_20,deals_cnt_max_3,deals_cnt_max_6,deals_cnt_max_9,deals_cnt_max_15,deals_cnt_max_20
1552,59,0,1847,0,0,0,0,0,1847.0,1847.0,...,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0,1847.0
1553,59,1,1696,1847,0,0,0,0,3543.0,3543.0,...,1696.0,1696.0,1696.0,1696.0,1696.0,1847.0,1847.0,1847.0,1847.0,1847.0
1554,59,2,1743,1696,0,0,0,0,5286.0,5286.0,...,1696.0,1696.0,1696.0,1696.0,1696.0,1847.0,1847.0,1847.0,1847.0,1847.0
1555,59,3,1271,1743,1847,0,0,0,4710.0,6557.0,...,1271.0,1271.0,1271.0,1271.0,1271.0,1743.0,1847.0,1847.0,1847.0,1847.0
1556,59,4,1194,1271,1696,0,0,0,4208.0,7751.0,...,1194.0,1194.0,1194.0,1194.0,1194.0,1743.0,1847.0,1847.0,1847.0,1847.0


In [601]:
# Expected width: 3 base columns (shop_id, date_block_num, deals_cnt)
# + 5 lags + 5 functions x 5 windows = 33 columns.
deals_cnt_df.shape

(1586, 33)

### Aggregates over prices per month and per shop, lags, window aggregates

In [602]:
# adding simple aggregates of prices over various deals
group_cols = ['shop_id', 'date_block_num']
simple_agg_funcs = ['sum', 'min', 'max', 'mean', 'std']
# `std` is NaN for a shop/month with a single sales record, hence the fillna(0).
prices_df = merged_df.groupby(group_cols).agg({'item_price': simple_agg_funcs}).fillna(0)
# Flatten the two-level column labels, e.g. ('item_price', 'sum') -> 'item_price_sum'.
prices_df.columns = ['_'.join(col) for col in prices_df.columns]
prices_df = prices_df.reset_index()
simple_agg_cols = [f'item_price_{agg}' for agg in simple_agg_funcs]

# adding lags
# The frame keeps its unique row index here, so the shifted values are matched
# back by row label instead of going through a duplicated `shop_id` index.
# NOTE(review): shift (and rolling below) step over existing rows. If a shop
# has no sales in some month there is no row for it, so `shift_k` is then not
# exactly k calendar months back -- confirm this is acceptable.
prices_df = prices_df.sort_values(group_cols)
for shift in [1, 3, 7, 12, 20]:
    for col in simple_agg_cols:
        prices_df[f'{col}_shift_{shift}'] = (
            prices_df.groupby('shop_id')[col].shift(periods=shift, fill_value=0)
        )

# adding window aggregates
prices_df = prices_df.sort_values(group_cols)
roll_funcs = ['sum', 'mean', 'std', 'min', 'max']
cols_to_agg = ['item_price_sum', 'item_price_mean']

for func in roll_funcs:
    for win_len in [2, 4, 7, 15, 30]:
        for col in cols_to_agg:
            rolled = (
                prices_df.groupby('shop_id')[col]
                .rolling(win_len, min_periods=1)
                .agg(func)
            )
            # Drop only the `shop_id` level: the remaining level is the
            # original row label, so the assignment aligns by label and does
            # not depend on the frame's index being 0..n-1 in sorted order.
            prices_df[f'{col}_roll_{func}_{win_len}'] = (
                rolled.reset_index(level=0, drop=True).fillna(0)
            )


In [603]:
# Expected width: 2 key columns + 5 price aggregates + 5 lags x 5 aggregates
# + 5 functions x 5 windows x 2 rolled columns = 82 columns.
prices_df.shape

(1586, 82)