In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

import time

warnings.filterwarnings('ignore')

In [2]:
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the memory used by the current process, in GiB.

    Reads the first field of psutil's ``memory_info()`` for this process
    (the resident set size, per psutil's documentation), converts bytes to
    GiB (2**30 bytes) and rounds to two decimals.
    """
    current_process = psutil.Process(os.getpid())
    used_bytes = current_process.memory_info()[0]
    return np.round(used_bytes / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024-based) prefixes.

    Parameters
    ----------
    num : int or float
        Size to format. Negative values keep their sign.
    suffix : str, default 'B'
        Text appended after the unit prefix.

    Returns
    -------
    str
        The value scaled until its magnitude is below 1024, printed with one
        decimal place and followed by the prefix and suffix (e.g. ``'1.5KiB'``).
        Values too large for the 'Zi' prefix are reported in 'Yi'.
    """
    scaled = num
    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(scaled) < 1024.0:
            return "%3.1f%s%s" % (scaled, prefix, suffix)
        # Still too large for this prefix: move on to the next one.
        scaled /= 1024.0
    # Every listed prefix was exhausted; fall back to the largest unit.
    return "%.1f%s%s" % (scaled, 'Yi', suffix)

In [3]:
########################### Global settings
#################################################################################
# Column holding the value the features are built from (the main target).
TARGET = 'sales'

# Last day in the train set.
END_TRAIN = 1913 + 28

# Columns that together identify a single item/day row.
MAIN_INDEX = ['id', 'd']

In [5]:
%%time
for i in range(1, 1+1):
    
    grid_df = pd.read_pickle('./grid_part_1_update.pkl')

    # We need only 'id','d','sales'
    # to make lags and rollings
    grid_df = grid_df[['id','d','sales']]
    
    SHIFT_DAY = i
    
    start_time = time.time()
    print('Create lags ' + str(SHIFT_DAY))
    
    LAG_DAYS = [col for col in range(SHIFT_DAY, SHIFT_DAY+15)]
    grid_df = grid_df.assign(**{
            '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
            for l in LAG_DAYS
            for col in [TARGET]
        })
    
    # Minify lag columns
    for col in list(grid_df):
        if 'lag' in col:
            grid_df[col] = grid_df[col].astype(np.float16)
    
    print('%0.2f min: Lags' % ((time.time() - start_time) / 60))
    
    """start_time = time.time()
    print('Create rolling aggs')

    for i in [7,14,21,28,35,42,49,60,180]:
        print('Rolling period:', i)
        grid_df['rolling_mean_'+str(i)] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
        grid_df['rolling_std_'+str(i)]  = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)
    """
    # Rollings
    # with sliding shift
    for d_shift in [SHIFT_DAY, SHIFT_DAY+2, SHIFT_DAY+6, SHIFT_DAY+13]: 
        print('Shifting period:', d_shift)
        for d_window in [7,14,28,60,180]:
            col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)
            grid_df[col_name] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(d_shift).rolling(d_window).mean()).astype(np.float16)


    print('%0.2f min: Lags' % ((time.time() - start_time) / 60))
        
    ########################### Export
    #################################################################################
    print('Save lags and rollings')
    grid_df.to_pickle('lags_df_'+str(SHIFT_DAY)+'_dbd.pkl')

Create lags 22
10.66 min: Lags
Shifting period: 22
Shifting period: 24
Shifting period: 28
Shifting period: 35
30.38 min: Lags
Save lags and rollings
Create lags 23


KeyboardInterrupt: 

In [11]:
# Display the column index of grid_df, the frame left over from the feature cell above.
# NOTE(review): the output recorded below lists columns such as rolling_std_* and
# rolling_mean_tmp_7_*, which the feature cell as currently written does not create —
# presumably left from an earlier run; re-run after a fresh kernel to refresh it.
grid_df.columns

Index(['id', 'd', 'sales', 'sales_lag_1', 'sales_lag_2', 'sales_lag_3',
       'sales_lag_4', 'sales_lag_5', 'sales_lag_6', 'sales_lag_7',
       'sales_lag_8', 'sales_lag_9', 'sales_lag_10', 'sales_lag_11',
       'sales_lag_12', 'sales_lag_13', 'sales_lag_14', 'sales_lag_15',
       'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
       'rolling_mean_21', 'rolling_std_21', 'rolling_mean_28',
       'rolling_std_28', 'rolling_mean_35', 'rolling_std_35',
       'rolling_mean_42', 'rolling_std_42', 'rolling_mean_49',
       'rolling_std_49', 'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14',
       'rolling_mean_tmp_1_28', 'rolling_mean_tmp_1_35',
       'rolling_mean_tmp_1_60', 'rolling_mean_tmp_1_180',
       'rolling_mean_tmp_7_7', 'rolling_mean_tmp_7_14',
       'rolling_mean_tmp_7_28', 'rolling_mean_tmp_7_35',
       'rolling_mean_tmp_7_60', 'rolling_mean_tmp_7_180',
       'rolling_mean_tmp_14_7', 'rolling_mean_tmp_14_14',
       'rolling_mean_tmp_14_28', 'ro