<a href="https://colab.research.google.com/github/kamalchapagain/Kaggle_competition_solutions/blob/master/M5_forecasting_accuracy_template_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary of this notebook

**This notebook develope the M5-accuracy model, it is divided into four main parts **
- Part 1. Simple Feature Engineering,
    - Sampling of dataset
    - Downcast of memory
    - Melting
    - Statistical features of sales: min/max/mean/std
    - Normalization (min/max scaling)
    - rolling aggregation
    - momentum
- Part 2. Lag Analysis,
- Part 3: Mean encoding (Custom encoding)
    - FE creation approaches
    - Baseline model for FE validation
    - Implementation of PCA for Dimension reduction
    - FE validation by Permutation importance

- Part 4: Final forecasting
    - parallelization for FE
    - get the final_processed datasets
    - model selection (implementation)
    - Train the model (beware of data leakage)
    - Final forecasting


# Part 1. Feature Engineering

In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

from math import ceil

from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [2]:
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

# Part 1.1 Downcating the memory 

In [3]:
## Downcating the memory 
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
## Merging by concat to not lose dtypes
def merge_by_concat(df1, df2, merge_on):
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1

In [6]:
########################### Vars ##############
TARGET = 'sales'         # Our main target
END_TRAIN = 1913         # Last day in train set, However new dataset contains upto d_1941
MAIN_INDEX = ['id','d']  # We can identify item by these columns

**load dataset from location**
- path='/content/drive/My Drive/GoogleColab/'

In [31]:
########################### Load Data#####################
print('Loading the sampled Data')
path='/content/drive/My Drive/GoogleColab/'
train_df = pd.read_csv(path+'sales_train_evaluation.csv')
prices_df = pd.read_csv(path+'sell_prices.csv')
calendar_df =pd.read_csv(path+'calendar.csv')

Loading the sampled Data


In [32]:
train_df.head(1)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,3,0,1,1,1,3,0,1,1,0,0,0,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1


**This this dataset upto 1913 is used for validation purpose, therefor, for simplicity to undersand, we are going to change the column *id* contains *_evaluation into **_validation**

In [33]:
import re
train_df.id.replace({'evaluation':'validation'}, regex=True, inplace=True)

# Part 1.2 Sampling of dataset

***Due to computational limitations, we are just taking 10% of dataset. Few bloggers talking about 'Stratified sampling', however we implement train_test_split strategy with random state.***

In [34]:
# Reduction of dataset (just take 10% of dataset)
from sklearn.model_selection import train_test_split

def sampling_dataset(dataset):
  data_keep, data_discard=train_test_split(dataset,train_size=0.1, random_state=42)
  print('Total size and sampled size')
  print(dataset.shape,data_keep.shape)
  data_keep.sort_index(inplace=True)
  data_keep.reset_index(drop=True, inplace=True)
  return data_keep

train_df=sampling_dataset(train_df)
prices_df=sampling_dataset(prices_df)
train_df_for_lag_feature=train_df.copy()


# train_df = sales_keep.copy() #pd.read_pickle(path+'sales_sampled.pkl')
# prices_df = prices_keep.copy() #pd.read_pickle(path+'prices_sampled.pkl')

Total size and sampled size
(30490, 1947) (3049, 1947)
Total size and sampled size
(6841121, 4) (684112, 4)


In [35]:
#Garbage collection (memory management)
gc.collect()

609

# Part 1.3  Melting the dataset
- In this section the we transform the horizontal dataset into the vertical view.

- Out index will be *'id'*, *'item_id'*, *'dept_id'*, *'cat_id'*, *'store_id'*, *'state_id'* and labels are *'d_'* coulmns

In [36]:
########################### Making Grids (for prices/sales) ###################################
print('Melting the dataset to different grids')

index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
grid_df = pd.melt(train_df, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)


#After melting, lets observe the train rows and columns
print('Train rows:', len(train_df), len(grid_df))


# To be able to make predictions we need to add "test set" to our grid
add_grid = pd.DataFrame()
for i in range(1,29):
    temp_df = train_df[index_columns]
    temp_df = temp_df.drop_duplicates()
    temp_df['d'] = 'd_'+ str(END_TRAIN+i)
    temp_df[TARGET] = np.nan
    add_grid = pd.concat([add_grid,temp_df])

grid_df = pd.concat([grid_df,add_grid])
grid_df = grid_df.reset_index(drop=True)

# Remove some temoprary DFs
del temp_df, add_grid

# We will not need original train_df anymore and can remove it
del train_df

# Let's check our memory usage
print("{:>20}: {:>8}".format('Size of grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

Melting the dataset to different grids
Train rows: 3049 5918109
     Size of grid_df: 366.4MiB


*We can free some memory by converting "strings" to categorical which will not affect merging and we will not lose any valuable data *

In [37]:
for col in index_columns:
    grid_df[col] = grid_df[col].astype('category')

# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

     Reduced grid_df: 137.6MiB


- ***The important thing that need to understand is that the leadings zero values in each*** *train_df*  ***item row are not real 0 sales but mean absence for the item in the store. Such zeros can remove to save some memory***


In [40]:
########################### Product Release date ###############################
print('Release week')

# Prices are set by week so it we will have not very accurate release week 
release_df = prices_df.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
release_df.columns = ['store_id','item_id','release']

# Now we can merge release_df
grid_df = merge_by_concat(grid_df, release_df, ['store_id','item_id'])
del release_df

# We want to remove some "zeros" rows from grid_df to do it we need wm_yr_wk column
# let's merge partly calendar_df to have it
grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk','d']], ['d'])
                      
# Now we can cutoff some rows and safe memory 
grid_df = grid_df[grid_df['wm_yr_wk']>=grid_df['release']]
grid_df = grid_df.reset_index(drop=True)

# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

# Should we keep release week as one of the features? for now, lets keep release feature by reducing the memory size int16.

grid_df['release'] = grid_df['release'] - grid_df['release'].min()
grid_df['release'] = grid_df['release'].astype(np.int16)

# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

Release week
    Original grid_df: 174.9MiB
     Reduced grid_df: 148.7MiB


In [41]:
########################### Save part 1 ################
print('Save Part 1')
print('Part 1 is ready')

# We have our BASE grid ready and can save it as pickle file for future use (model training)
grid_df.to_pickle('/content/drive/My Drive/GoogleColab/pickle_files/grid_part_1.pkl')

print('Size:', grid_df.shape)
grid_part_1=grid_df.copy() #save in the name of grid_part_1

Save Part 1
Part 1 is ready
Size: (4578794, 10)


In [42]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4578794 entries, 0 to 4578793
Data columns (total 10 columns):
 #   Column    Dtype   
---  ------    -----   
 0   id        category
 1   item_id   category
 2   dept_id   category
 3   cat_id    category
 4   store_id  category
 5   state_id  category
 6   d         object  
 7   sales     float64 
 8   release   int16   
 9   wm_yr_wk  int64   
dtypes: category(6), float64(1), int16(1), int64(1), object(1)
memory usage: 148.7+ MB


#Part 2. Feature Engineering
- **Some basic FE for prices are:** 
- statistical feature for prices are: min_price, max_price, average price, deviation in price
- Normalization (min/max scaling)
- "rolling" aggregations but would like months and years as "window"
- Momentum of the prices


In [None]:
########################### Prices ############################################
print('Feature Engineering of Prices')

# We can do some basic aggregations
prices_df['price_max'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('max')
prices_df['price_min'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('min')
prices_df['price_std'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('std')
prices_df['price_mean'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('mean')

# and do price normalization (min/max scaling)
prices_df['price_norm'] = prices_df['sell_price']/prices_df['price_max']

# Some items are can be inflation dependent and some items are very "stable"
prices_df['price_nunique'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
prices_df['item_nunique'] = prices_df.groupby(['store_id','sell_price'])['item_id'].transform('nunique')

# we would like some "rolling" aggregations but would like months and years as "window"
calendar_prices = calendar_df[['wm_yr_wk','month','year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
prices_df = prices_df.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
del calendar_prices

# Now we can add price "momentum" (some sort of) Shifted by week, by month mean, and by year mean
prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

del prices_df['month'], prices_df['year']


Feature Engineering of Prices


In [43]:
########################### Merge prices and save part 2 #################################################
print('Merge prices and save part 2')

# Merge Prices
original_columns = list(grid_df)
grid_df = grid_df.merge(prices_df, on=['store_id','item_id','wm_yr_wk'], how='left')
keep_columns = [col for col in list(grid_df) if col not in original_columns]
grid_df = grid_df[MAIN_INDEX+keep_columns]
grid_df = reduce_mem_usage(grid_df)

# Safe part 2
grid_df.to_pickle('/content/drive/My Drive/GoogleColab/pickle_files/grid_part_2.pkl')
print('Size:', grid_df.shape)
grid_part_2=grid_df.copy()

# We don't need prices_df anymore
del prices_df


Merge prices and save part 2
Mem. usage decreased to 87.43 Mb (23.1% reduction)
Size: (4578794, 3)


In [44]:
grid_part_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4578794 entries, 0 to 4578793
Data columns (total 3 columns):
 #   Column      Dtype   
---  ------      -----   
 0   id          category
 1   d           object  
 2   sell_price  float16 
dtypes: category(1), float16(1), object(1)
memory usage: 87.4+ MB


In [45]:
calendar_df.columns

Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'],
      dtype='object')

In [46]:
########################### Lets Merge the calendar ###########################

grid_df = grid_df[MAIN_INDEX]

# Merge calendar partly 
icols = ['date',
         'd',
         'event_name_1',
         'event_type_1',
         'event_name_2',
         'event_type_2',
         'snap_CA',
         'snap_TX',
         'snap_WI']

grid_df = grid_df.merge(calendar_df[icols], on=['d'], how='left')

# Again, lets minify data
# 'snap_' columns we can convert to bool or int8
icols = ['event_name_1',
         'event_type_1',
         'event_name_2',
         'event_type_2',
         'snap_CA',
         'snap_TX',
         'snap_WI']
for col in icols:
    grid_df[col] = grid_df[col].astype('category')

# Convert to DateTime
grid_df['date'] = pd.to_datetime(grid_df['date'])

# Make some features from date
grid_df['tm_d'] = grid_df['date'].dt.day.astype(np.int8)
grid_df['tm_w'] = grid_df['date'].dt.week.astype(np.int8)
grid_df['tm_m'] = grid_df['date'].dt.month.astype(np.int8)
grid_df['tm_y'] = grid_df['date'].dt.year
grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)
grid_df['tm_wm'] = grid_df['tm_d'].apply(lambda x: ceil(x/7)).astype(np.int8)

grid_df['tm_dw'] = grid_df['date'].dt.dayofweek.astype(np.int8)
grid_df['tm_w_end'] = (grid_df['tm_dw']>=5).astype(np.int8)

# Remove date
del grid_df['date']

In [47]:
########################### Save part 3 (Dates) ###########################
print('Save part 3')

grid_df.to_pickle('/content/drive/My Drive/GoogleColab/pickle_files/grid_part_3.pkl')
print('Size:', grid_df.shape)

grid_part_3=grid_df.copy()

del calendar_df
del grid_df

Save part 3
Size: (4578794, 16)


In [48]:
grid_part_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4578794 entries, 0 to 4578793
Data columns (total 16 columns):
 #   Column        Dtype   
---  ------        -----   
 0   id            category
 1   d             object  
 2   event_name_1  category
 3   event_type_1  category
 4   event_name_2  category
 5   event_type_2  category
 6   snap_CA       category
 7   snap_TX       category
 8   snap_WI       category
 9   tm_d          int8    
 10  tm_w          int8    
 11  tm_m          int8    
 12  tm_y          int8    
 13  tm_wm         int8    
 14  tm_dw         int8    
 15  tm_w_end      int8    
dtypes: category(8), int8(7), object(1)
memory usage: 139.8+ MB


In [49]:
########################### Some additional cleaning (converting 'd' into int)
#################################################################################

## Part 1
# Convert 'd' to int
grid_df = grid_part_1.copy()
grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)

path='/content/drive/My Drive/GoogleColab/pickle_files/'
grid_df.to_pickle(path+'grid_part_1.pkl')

# Remove 'wm_yr_wk'
# as test values are not in train set
del grid_df['wm_yr_wk']
grid_part_1=grid_df.copy()

del grid_df


In [None]:
# #Path for pickle file
# path='/content/drive/My Drive/GoogleColab/pickle_files/'
# grid_part_1.to_pickle(path+'grid_part_1.pkl')
# grid_part_2.to_pickle(path+'grid_part_2.pkl')
# grid_part_3.to_pickle(path+'grid_part_3.pkl')


In [None]:
# #Lets read same 3 sets of features
# grid_df = pd.concat([pd.read_pickle(path+'grid_part_1.pkl'),
#                      pd.read_pickle(path+'grid_part_2.pkl').iloc[:,2:],
#                      pd.read_pickle(path+'grid_part_3.pkl').iloc[:,2:]],
#                      axis=1)

In [50]:
# lets concat all three grids for further processing

grid_df=pd.concat([grid_part_1, grid_part_2.iloc[:,2:], grid_part_3.iloc[:,2:]], axis=1)

# Let's check again memory usage
print("{:>20}: {:>8}".format('Full Grid',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
print('Size:', grid_df.shape)

#           Full Grid:   157.4 MiB

#Still memory usage is high, we can train by state_id or shop_id !!

state_id = 'CA'
grid_df = grid_df[grid_df['state_id']==state_id]
print("{:>20}: {:>8}".format('Full Grid',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
# Full Grid:  75.3MiB

store_id = 'CA_1'
grid_df = grid_df[grid_df['store_id']==store_id]
print("{:>20}: {:>8}".format('Full Grid',sizeof_fmt(grid_df.memory_usage(index=True).sum())))
#           Now, Full Grid: 19.2 MiB


           Full Grid: 157.4MiB
Size: (4578794, 24)
           Full Grid:  75.3MiB
           Full Grid:  19.2MiB


In [51]:
########################### Final list of features
#################################################################################
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 451999 entries, 0 to 4576039
Data columns (total 24 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   id            451999 non-null  category
 1   item_id       451999 non-null  category
 2   dept_id       451999 non-null  category
 3   cat_id        451999 non-null  category
 4   store_id      451999 non-null  category
 5   state_id      451999 non-null  category
 6   d             451999 non-null  int16   
 7   sales         443739 non-null  float64 
 8   release       451999 non-null  int16   
 9   sell_price    46182 non-null   float16 
 10  event_name_1  37127 non-null   category
 11  event_type_1  37127 non-null   category
 12  event_name_2  863 non-null     category
 13  event_type_2  863 non-null     category
 14  snap_CA       451999 non-null  category
 15  snap_TX       451999 non-null  category
 16  snap_WI       451999 non-null  category
 17  tm_d          451999 non-nul

# Part 2.1 Encoding (Lag Feature)
- Rolling lag

https://www.kaggle.com/kyakovlev/m5-lags-features


In [52]:
#Lets load the sales dataset
train_df=train_df_for_lag_feature.copy() #lets save one file at the begining. This file will be needed in future.

In [53]:
# To make all calculations faster, we will limit dataset by 'CA' state
train_df = train_df[train_df['state_id']=='CA']
print('Shape is: %s' %str(train_df.shape))

Shape is: (1202, 1947)


*** Here the number of features are d_1 to d_1941 (but we will consider upto d_1912) i.e a lot of feature, which is good, but we have just 1202 training rows (however we are taking only for CA) ***

*** In other hand we can think of d_ columns as additional labels and can significantly scale up our training set*** 

In [54]:
## Horizontal representation to vertical representation (tranformation)
train_df.iloc[:10]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,...,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2,0,2,0,2,1,0,0,0,0,0,0,0,0,2,0,2,1,0,0,1,0,0,1,0,2,1,1,0,0,1
1,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,2,0,3,0,1,1,3,1,1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,1,0
2,HOBBIES_1_014_CA_1_validation,HOBBIES_1_014,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,3,0,2,2,1,2,0,0,3,1,1,1,0,0,2,3,0,3,5,1,3,1,4,1,2,1,1,0,2,1,1,1,0,2,2,1,1,1,3
3,HOBBIES_1_056_CA_1_validation,HOBBIES_1_056,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,2,2,2,4,0,1,1,1,1,0,1,0,1,0,0,6,4,2,0,0,0,0,0,4,3,2,2,4,1,1,1,0,0,0,0,1,0,0,0
4,HOBBIES_1_062_CA_1_validation,HOBBIES_1_062,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1
5,HOBBIES_1_080_CA_1_validation,HOBBIES_1_080,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,1,0,0,0,2,0,0,0,0,0,9,0,0,0,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,12,2,0,0,7,0,0,4,0,0,0,13,5,0,0,0,3,0,5,0,2,0,0,3,4,1,1,3,0,0,0,0,0
6,HOBBIES_1_092_CA_1_validation,HOBBIES_1_092,HOBBIES_1,HOBBIES,CA_1,CA,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,HOBBIES_1_110_CA_1_validation,HOBBIES_1_110,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,...,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0
8,HOBBIES_1_122_CA_1_validation,HOBBIES_1_122,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,5,2,11,2,0,7,3,2,0,0,0,9,0,3,0,1,4,1,0,0,0,0,4
9,HOBBIES_1_130_CA_1_validation,HOBBIES_1_130,HOBBIES_1,HOBBIES,CA_1,CA,4,3,4,3,0,1,0,0,0,3,0,0,2,4,0,3,0,0,0,1,2,5,0,3,2,1,0,2,1,1,1,0,2,1,...,0,3,1,0,0,3,1,4,0,2,0,1,1,0,0,0,1,2,0,0,0,0,5,2,0,1,0,0,0,3,1,0,1,0,0,1,0,3,1,3


***Lets go for Vertical representation:***

- **In other hand we can think of d_ columns as additional labels and can significantly** 
- **scale up our training set to 2333082 rows with 8 columns**

- **Good thing that our model will have greater input for training**

- **Bad thing that we are losing lags that we had in horizontal representation and**
- **also new data set consumes much more memory**

In [55]:
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_df = pd.melt(train_df, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

train_df[train_df['id']=='HOBBIES_1_010_CA_1_validation'].iloc[:10]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1202,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_2,0
2404,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_3,1
3606,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_4,0
4808,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_5,0
6010,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_6,0
7212,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_7,0
8414,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_8,0
9616,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_9,0
10818,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_10,0


*** Lags creation ***
- **For the lag creation our dataset is allready sorted by d values**
- **now, we can simply shift() values also we have to keep in mind that we need to aggregate values on 'id' level**


In [57]:
########################### Lags creation ##########################################################

# group and shift in loop
temp_df = train_df[['id','d',TARGET]]

start_time = time.time()
for i in range(1,8):
    print('Shifting:', i)
    temp_df['lag_'+str(i)] = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(i))
    
print('%0.2f min: Time for loops' % ((time.time() - start_time) / 60))


# Or same in "compact" manner
LAG_DAYS = [col for col in range(1,8)]
temp_df = train_df[['id','d',TARGET]]

start_time = time.time()
temp_df = temp_df.assign(**{
        '{}_lag_{}'.format(col, l): temp_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in LAG_DAYS
        for col in [TARGET]
    })

print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

Shifting: 1
Shifting: 2
Shifting: 3
Shifting: 4
Shifting: 5
Shifting: 6
Shifting: 7
0.15 min: Time for loops
0.15 min: Time for bulk shift


In [62]:
temp_df

Unnamed: 0,id,d,sales,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7
0,HOBBIES_1_010_CA_1_validation,d_1,0,,,,,,,
1,HOBBIES_1_012_CA_1_validation,d_1,0,,,,,,,
2,HOBBIES_1_014_CA_1_validation,d_1,0,,,,,,,
3,HOBBIES_1_056_CA_1_validation,d_1,0,,,,,,,
4,HOBBIES_1_062_CA_1_validation,d_1,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2333077,FOODS_3_807_CA_4_validation,d_1941,1,4.0,2.0,1.0,1.0,3.0,4.0,0.0
2333078,FOODS_3_810_CA_4_validation,d_1941,2,5.0,1.0,2.0,1.0,1.0,2.0,5.0
2333079,FOODS_3_814_CA_4_validation,d_1941,3,1.0,2.0,1.0,0.0,1.0,1.0,1.0
2333080,FOODS_3_815_CA_4_validation,d_1941,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


***We can notice many NaNs values - which is normal because there is no data for day 0,-1,-2 (out of dataset time periods)

***Same works for test set*** 

**be careful to make lag features:**
- *for day 1920 there is no data about day 1919 (until 1913)
- *So if we want to predict day 1915, our lag features have to start from 2 (1915 (which is the predicting day)-1913 (the last day with label in dataset), and so on*

In [63]:
# The result
temp_df[temp_df['id']=='HOBBIES_1_010_CA_1_validation'].iloc[:10]


Unnamed: 0,id,d,sales,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,sales_lag_6,sales_lag_7
0,HOBBIES_1_010_CA_1_validation,d_1,0,,,,,,,
1202,HOBBIES_1_010_CA_1_validation,d_2,0,0.0,,,,,,
2404,HOBBIES_1_010_CA_1_validation,d_3,1,0.0,0.0,,,,,
3606,HOBBIES_1_010_CA_1_validation,d_4,0,1.0,0.0,0.0,,,,
4808,HOBBIES_1_010_CA_1_validation,d_5,0,0.0,1.0,0.0,0.0,,,
6010,HOBBIES_1_010_CA_1_validation,d_6,0,0.0,0.0,1.0,0.0,0.0,,
7212,HOBBIES_1_010_CA_1_validation,d_7,0,0.0,0.0,0.0,1.0,0.0,0.0,
8414,HOBBIES_1_010_CA_1_validation,d_8,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9616,HOBBIES_1_010_CA_1_validation,d_9,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10818,HOBBIES_1_010_CA_1_validation,d_10,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


***Rolling lags***

- *Because of computational limitation, we restored few days of sales values from horizontal representation as lag features* 

- *lambda x: x.shift(1)==> 1 day shift will serve only to predict day 1914

- *for other days we have to shift PREDICT_DAY-1913*

- *Lets go for rolling aggregation* 

In [64]:
########################### Rolling lags ###########################################################

## Rolling aggregations



# Such aggregations will help us to restore
# at least part of the information for our model

temp_df = train_df[['id','d','sales']]

start_time = time.time()

for i in [14,30,60]:
    print('Rolling period:', i)
    temp_df['rolling_mean_'+str(i)] = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    temp_df['rolling_std_'+str(i)]  = temp_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())

# we can also aggregate by max/skew/median etc 

print('%0.2f min: Time for loop' % ((time.time() - start_time) / 60))

Rolling period: 14
Rolling period: 30
Rolling period: 60
0.19 min: Time for loop


In [65]:
104/6

17.333333333333332

In [66]:
# The result 
temp_df[temp_df['id']=='HOBBIES_1_012_CA_1_validation'].iloc[:20]

# Same as before, the NaNs values - is is because of absence of data for 
# 0*(rolling_period),-1*(rolling_period),-2*(rolling_period)

Unnamed: 0,id,d,sales,rolling_mean_14,rolling_std_14,rolling_mean_30,rolling_std_30,rolling_mean_60,rolling_std_60
1,HOBBIES_1_012_CA_1_validation,d_1,0,,,,,,
1203,HOBBIES_1_012_CA_1_validation,d_2,2,,,,,,
2405,HOBBIES_1_012_CA_1_validation,d_3,0,,,,,,
3607,HOBBIES_1_012_CA_1_validation,d_4,0,,,,,,
4809,HOBBIES_1_012_CA_1_validation,d_5,0,,,,,,
6011,HOBBIES_1_012_CA_1_validation,d_6,0,,,,,,
7213,HOBBIES_1_012_CA_1_validation,d_7,0,,,,,,
8415,HOBBIES_1_012_CA_1_validation,d_8,2,,,,,,
9617,HOBBIES_1_012_CA_1_validation,d_9,0,,,,,,
10819,HOBBIES_1_012_CA_1_validation,d_10,0,,,,,,


***'Memory usage' minify mechanism***

In [67]:
########################### Memory ussage ###################################################
# Let's check our memory usage
print("{:>20}: {:>8}".format('Original rolling df',sizeof_fmt(temp_df.memory_usage(index=True).sum())))

# can we minify it?
# 1. if our dataset are aligned by index 
#    we don't need 'id' 'd' 'sales' columns
temp_df = temp_df.iloc[:,3:]
print("{:>20}: {:>8}".format('Values rolling df',sizeof_fmt(temp_df.memory_usage(index=True).sum())))

# can we make it even smaller?
# yes, carefully change dtype and/or use sparce matrix to minify 0s
# Also note that lgbm accepts matrixes as input that is good for memory reducion 
from scipy import sparse 
temp_matrix = sparse.csr_matrix(temp_df)

# restore to df
temp_matrix_restored = pd.DataFrame(temp_matrix.todense())
restored_cols = ['roll_' + str(i) for i in list(temp_matrix_restored)]
temp_matrix_restored.columns = restored_cols

 Original rolling df: 160.2MiB
   Values rolling df: 106.8MiB


In [68]:
########################### Remove old objects
#################################################################################
del temp_df, train_df, temp_matrix, temp_matrix_restored

In [69]:
########################### Apply on grid_df #####################################################
# lets read first grid from 
grid_df = pd.read_pickle('/content/drive/My Drive/GoogleColab/pickle_files/grid_part_1.pkl')

# We need only 'id','d','sales'
# to make lags and rollings
grid_df = grid_df[['id','d','sales']]
SHIFT_DAY = 28

# Lags
# with 28 day shift
start_time = time.time()
print('Create lags')

LAG_DAYS = [col for col in range(SHIFT_DAY,SHIFT_DAY+15)]
grid_df = grid_df.assign(**{
        '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in LAG_DAYS
        for col in [TARGET]
    })

# Minify lag columns
for col in list(grid_df):
    if 'lag' in col:
        grid_df[col] = grid_df[col].astype(np.float16)

print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

# Rollings
# with 28 day shift
start_time = time.time()
print('Create rolling aggs')

for i in [7,14,30,60,180]:
    print('Rolling period:', i)
    grid_df['rolling_mean_'+str(i)] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)
    grid_df['rolling_std_'+str(i)]  = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)

# Rollings
# with sliding shift
for d_shift in [1,7,14]: 
    print('Shifting period:', d_shift)
    for d_window in [7,14,30,60]:
        col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)
        grid_df[col_name] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(d_shift).rolling(d_window).mean()).astype(np.float16)
    
    
print('%0.2f min: Lags' % ((time.time() - start_time) / 60))

Create lags
0.51 min: Lags
Create rolling aggs
Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180
Shifting period: 1
Shifting period: 7
Shifting period: 14
1.18 min: Lags


In [70]:
########################### Export ###############################################
path='/content/drive/My Drive/GoogleColab/pickle_files/'
print('Save lags and rollings')
grid_df.to_pickle(path+'lags_df_'+str(SHIFT_DAY)+'.pkl')

Save lags and rollings


In [71]:
########################### Final list of new features ################################################
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4578794 entries, 0 to 4578793
Data columns (total 40 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   id                      category
 1   d                       int16   
 2   sales                   float64 
 3   sales_lag_28            float16 
 4   sales_lag_29            float16 
 5   sales_lag_30            float16 
 6   sales_lag_31            float16 
 7   sales_lag_32            float16 
 8   sales_lag_33            float16 
 9   sales_lag_34            float16 
 10  sales_lag_35            float16 
 11  sales_lag_36            float16 
 12  sales_lag_37            float16 
 13  sales_lag_38            float16 
 14  sales_lag_39            float16 
 15  sales_lag_40            float16 
 16  sales_lag_41            float16 
 17  sales_lag_42            float16 
 18  rolling_mean_7          float16 
 19  rolling_std_7           float16 
 20  rolling_mean_14         float16 
 21  rolling_

# Part 3: Mean encoding (Custom encoding)
- 1. FE creation approaches
- 2. Sequential FE validation
- 3. Dimension reduction
- 4. FE validation by Permutation importance
- 5. Mean encodings
- 6. Parallelization for FE

In [72]:
#read data
#Lets read same 3 sets of features
path='/content/drive/My Drive/GoogleColab/pickle_files/'
grid_df = pd.concat([pd.read_pickle(path+'grid_part_1.pkl'),
                     pd.read_pickle(path+'grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(path+'grid_part_3.pkl').iloc[:,2:]],
                     axis=1)

 ## OR
#grid_df=pd.concat([grid_part_1, grid_part_2.iloc[:,2:], grid_part_3.iloc[:,2:]], axis=1)


In [74]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4578794 entries, 0 to 4578793
Data columns (total 25 columns):
 #   Column        Dtype   
---  ------        -----   
 0   id            category
 1   item_id       category
 2   dept_id       category
 3   cat_id        category
 4   store_id      category
 5   state_id      category
 6   d             int16   
 7   sales         float64 
 8   release       int16   
 9   wm_yr_wk      int64   
 10  sell_price    float16 
 11  event_name_1  category
 12  event_type_1  category
 13  event_name_2  category
 14  event_type_2  category
 15  snap_CA       category
 16  snap_TX       category
 17  snap_WI       category
 18  tm_d          int8    
 19  tm_w          int8    
 20  tm_m          int8    
 21  tm_y          int8    
 22  tm_wm         int8    
 23  tm_dw         int8    
 24  tm_w_end      int8    
dtypes: category(13), float16(1), float64(1), int16(2), int64(1), int8(7)
memory usage: 192.3 MB


In [75]:
# Subsampling
# to make all calculations faster.
# Keep only 5% of original ids.
keep_id = np.array_split(list(grid_df['id'].unique()), 20)[0]
grid_df = grid_df[grid_df['id'].isin(keep_id)].reset_index(drop=True)

# Let's "inspect" our grid DataFrame
grid_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300865 entries, 0 to 300864
Data columns (total 25 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   id            300865 non-null  category
 1   item_id       300865 non-null  category
 2   dept_id       300865 non-null  category
 3   cat_id        300865 non-null  category
 4   store_id      300865 non-null  category
 5   state_id      300865 non-null  category
 6   d             300865 non-null  int16   
 7   sales         296581 non-null  float64 
 8   release       300865 non-null  int16   
 9   wm_yr_wk      300865 non-null  int64   
 10  sell_price    30956 non-null   float16 
 11  event_name_1  24786 non-null   category
 12  event_type_1  24786 non-null   category
 13  event_name_2  612 non-null     category
 14  event_type_2  612 non-null     category
 15  snap_CA       300865 non-null  category
 16  snap_TX       300865 non-null  category
 17  snap_WI       300865 non-null

In [76]:
grid_df.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk,sell_price,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end
0,HOBBIES_1_303_CA_1_validation,HOBBIES_1_303,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0,11101,8.976562,,,,,0,0,0,29,4,1,0,5,5,1
1,HOUSEHOLD_1_401_CA_1_validation,HOUSEHOLD_1_401,HOUSEHOLD_1,HOUSEHOLD,CA_1,CA,1,1.0,0,11101,8.867188,,,,,0,0,0,29,4,1,0,5,5,1


# Part 3.1 Baseline model

In [77]:
########################### Baseline model ##########################################

# We will need some global VARS for future

SEED = 142             # Our random seed for everything
random.seed(SEED)      # to make all tests "deterministic"
np.random.seed(SEED)
N_CORES = psutil.cpu_count()     # Available CPU cores (we have just 2)

TARGET = 'sales'      # Our Target
END_TRAIN = 1913      # And we will use last 28 days as validation

# Drop some items from "TEST" set part (1914...)
grid_df = grid_df[grid_df['d']<=END_TRAIN].reset_index(drop=True)

# Features that we want to exclude from training
remove_features = ['id','d',TARGET]

# Our baseline model serves to do fast checks of new features performance 

# We will use LightGBM for our tests
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',         # Standart boosting type
                    'objective': 'regression',       # Standart loss for RMSE
                    'metric': ['rmse'],              # as we will use rmse as metric "proxy"
                    'subsample': 0.8,                
                    'subsample_freq': 1,
                    'learning_rate': 0.05,           # 0.5 is "fast enough" for us
                    'num_leaves': 2**7-1,            # We will need model only for fast check
                    'min_data_in_leaf': 2**8-1,      # So we want it to train faster even with drop in generalization 
                    'feature_fraction': 0.8,
                    'n_estimators': 5000,            # We don't want to limit training (we can change 5000 to any big enough number)
                    'early_stopping_rounds': 30,     # We will stop training almost immediately (if it stops improving) 
                    'seed': SEED,
                    'verbose': -1,
                } 


In [78]:
## RMSE
def rmse(y, y_pred):
    return np.sqrt(np.mean(np.square(y - y_pred)))

# Small function to make fast features tests
# estimator = make_fast_test(grid_df)
# it will return lgb booster for future analisys
def make_fast_test(df):

    features_columns = [col for col in list(df) if col not in remove_features]

    tr_x, tr_y = df[df['d']<=(END_TRAIN-28)][features_columns], df[df['d']<=(END_TRAIN-28)][TARGET]              
    vl_x, v_y = df[df['d']>(END_TRAIN-28)][features_columns], df[df['d']>(END_TRAIN-28)][TARGET]
    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)
    
    estimator = lgb.train(
                            lgb_params,
                            train_data,
                            valid_sets = [train_data,valid_data],
                            verbose_eval = 500,
                        )
    
    return estimator

# Make baseline model
baseline_model = make_fast_test(grid_df)



Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[307]	training's rmse: 1.88327	valid_1's rmse: 1.85104


In [79]:
########################### Lets test our normal Lags (7 days)###########################

# Small helper to make lags creation faster
from multiprocessing import Pool                # Multiprocess Runs

## Multiprocessing Run.
# :t_split - int of lags days                   # type: int
# :func - Function to apply on each split       # type: python function

## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

def make_normal_lag(lag_day):
    lag_df = grid_df[['id','d',TARGET]] 
    col_name = 'sales_lag_'+str(lag_day)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(lag_day)).astype(np.float16)
    return lag_df[[col_name]]

# Launch parallel lag creation
# and "append" to our grid
LAGS_SPLIT = [col for col in range(1,1+7)]
grid_df = pd.concat([grid_df, df_parallelize_run(make_normal_lag,LAGS_SPLIT)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)


Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[179]	training's rmse: 1.79122	valid_1's rmse: 1.76335


In [80]:
########################### Permutation importance Test
########################### https://www.kaggle.com/dansbecker/permutation-importance @dansbecker
#################################################################################

# Let's creat validation dataset and features
features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Make normal prediction with our model and save score
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)


# Now we are looping over all our numerical features
for col in features_columns:
    
    # We will make validation set copy to restore
    # features states on each run
    temp_df = validation_df.copy()
    
    # Error here appears if we have "categorical" features and can't 
    # do np.random.permutation without disrupt categories
    # so we need to check if feature is numerical
    if temp_df[col].dtypes.name != 'category':
        temp_df[col] = np.random.permutation(temp_df[col].values)
        temp_df['preds'] = test_model.predict(temp_df[features_columns])
        cur_score = rmse(temp_df[TARGET], temp_df['preds'])
        
        # If our current rmse score is less than base score
        # it means that feature most probably is a bad one
        # and our model is learning on noise
        print(col, np.round(cur_score - base_score, 4))


Standart RMSE 1.7633459853087066
release -0.0001
wm_yr_wk 0.0
sell_price 0.0
tm_d 0.0124
tm_w -0.0001
tm_m -0.0001
tm_y 0.0
tm_wm 0.0004
tm_dw 0.1121
tm_w_end 0.0061
sales_lag_1 0.2887
sales_lag_2 0.0363
sales_lag_3 0.0426
sales_lag_4 0.0393
sales_lag_5 0.0077
sales_lag_6 0.012
sales_lag_7 0.0451


# Insights:
- Lags with 1 days shift (nearest past) shows the important features while other features not much. 

- Better to test with several Permutation for confirmation
(https://www.kaggle.com/dansbecker/permutation-importance @dansbecker)

- price_nunique -0.002 : strong negative values are most probably noise

- The idea is the following: feature importance can be measured by looking at how much the score (accuracy, mse, rmse, mae, etc. - any score we’re interested in) decreases when a feature is not available.

- To do that one can remove feature from the dataset, re-train the estimator and check the score. But it requires re-training an estimator for each feature, which can be computationally intensive. Also, it shows what may be important within a dataset, not what is important within a concrete trained model.

- To avoid re-training the estimator we can remove a feature only from the test part of the dataset, and compute score without using this feature. 

- Instead of removing a feature we can replace it with random noise - feature column is still there, but it no longer contains useful information. 

- This method works if noise is drawn from the same distribution as original feature values (as otherwise estimator may fail). 

In [81]:
# Remove Temp data
del temp_df, validation_df

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]

In [82]:

LAGS_SPLIT = [col for col in range(56,56+7)]
grid_df = pd.concat([grid_df, df_parallelize_run(make_normal_lag,LAGS_SPLIT)], axis=1)
test_model = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)

for col in features_columns:
    temp_df = validation_df.copy()
    if temp_df[col].dtypes.name != 'category':
        temp_df[col] = np.random.permutation(temp_df[col].values)
        temp_df['preds'] = test_model.predict(temp_df[features_columns])
        cur_score = rmse(temp_df[TARGET], temp_df['preds'])
        print(col, np.round(cur_score - base_score, 4))

del temp_df, validation_df

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[219]	training's rmse: 1.93686	valid_1's rmse: 1.86004
Standart RMSE 1.8600396804547472
release 0.0001
wm_yr_wk 0.0
sell_price 0.0009
tm_d 0.0069
tm_w 0.0025
tm_m -0.002
tm_y 0.0
tm_wm 0.0011
tm_dw 0.0536
tm_w_end 0.0046
sales_lag_56 0.0461
sales_lag_57 0.0268
sales_lag_58 0.0071
sales_lag_59 0.0157
sales_lag_60 0.0087
sales_lag_61 0.0141
sales_lag_62 0.0157


In [83]:
     
# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]


# Results:
## Lags with 56 days shift (far away past) are not as important
## as nearest past lags
## and at some point will be just noise for our model

# Part 3.2 Implementation of PCA algorithm

In [84]:
########################### PCA #################################################

# The main question here - can we have almost same rmse boost with less features
# less dimensionality?

# Lets try PCA and make 7->3 dimensionality reduction

# PCA is "unsupervised" learning and with shifted target we can be sure
# that we have no Target leakage
from sklearn.decomposition import PCA

def make_pca(df, pca_col, n_days):
    print('PCA:', pca_col, n_days)
    
    # We don't need any other columns to make pca
    pca_df = df[[pca_col,'d',TARGET]]
    
    # If we are doing pca for other series "levels" 
    # we need to agg first
    if pca_col != 'id':
        merge_base = pca_df[[pca_col,'d']]
        pca_df = pca_df.groupby([pca_col,'d'])[TARGET].agg(['sum']).reset_index()
        pca_df[TARGET] = pca_df['sum']
        del pca_df['sum']
    
    # Min/Max scaling
    pca_df[TARGET] = pca_df[TARGET]/pca_df[TARGET].max()
    
    # Making "lag" in old way (not parallel)
    LAG_DAYS = [col for col in range(1,n_days+1)]
    format_s = '{}_pca_'+pca_col+str(n_days)+'_{}'
    pca_df = pca_df.assign(**{
            format_s.format(col, l): pca_df.groupby([pca_col])[col].transform(lambda x: x.shift(l))
            for l in LAG_DAYS
            for col in [TARGET]
        })
    
    pca_columns = list(pca_df)[3:]
    pca_df[pca_columns] = pca_df[pca_columns].fillna(0)
    pca = PCA(random_state=SEED)
    
    # You can use fit_transform here
    pca.fit(pca_df[pca_columns])
    pca_df[pca_columns] = pca.transform(pca_df[pca_columns])
    
    print(pca.explained_variance_ratio_)
    
    # we will keep only 3 most "valuable" columns/dimensions 
    keep_cols = pca_columns[:3]
    print('Columns to keep:', keep_cols)
    
    # If we are doing pca for other series "levels"
    # we need merge back our results to merge_base df
    # and only than return resulted df
    # I'll skip that step here
    
    return pca_df[keep_cols]

In [85]:

# Make PCA
grid_df = pd.concat([grid_df, make_pca(grid_df,'id',7)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if '_pca_' not in col]
grid_df = grid_df[keep_cols]


PCA: id 7
[0.71783945 0.06713417 0.05399105 0.04415797 0.04118687 0.03797171
 0.03771877]
Columns to keep: ['sales_pca_id7_1', 'sales_pca_id7_2', 'sales_pca_id7_3']
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[240]	training's rmse: 1.79727	valid_1's rmse: 1.77568


In [86]:

########################### Mean/std target encoding ################################################

# We will use these three columns for test
# (in combination with store_id)
icols = ['item_id','cat_id','dept_id']

# But we can use any other column or even multiple groups
# like these ones
#            'state_id',
#            'store_id',
#            'cat_id',
#            'dept_id',
#            ['state_id', 'cat_id'],
#            ['state_id', 'dept_id'],
#            ['store_id', 'cat_id'],
#            ['store_id', 'dept_id'],
#            'item_id',
#            ['item_id', 'state_id'],
#            ['item_id', 'store_id']

# There are several ways to do "mean" encoding
## K-fold scheme
## LOO (leave one out)
## Smoothed/regularized 
## Expanding mean
## etc 

# We will use simple target encoding
# by std and mean agg
for col in icols:
    print('Encoding', col)
    temp_df = grid_df[grid_df['d']<=(1913-28)] # to be sure we don't have leakage in our validation set
    
    temp_df = temp_df.groupby([col,'store_id']).agg({TARGET: ['std','mean']})
    joiner = '_'+col+'_encoding_'
    temp_df.columns = [joiner.join(col).strip() for col in temp_df.columns.values]
    temp_df = temp_df.reset_index()
    grid_df = grid_df.merge(temp_df, on=[col,'store_id'], how='left')
    del temp_df


Encoding item_id
Encoding cat_id
Encoding dept_id


In [87]:
# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
keep_cols = [col for col in list(grid_df) if '_encoding_' not in col]
grid_df = grid_df[keep_cols]

# Bad thing that for some items  
# we are using past and future values.
# But we are looking for "categorical" similiarity
# on a "long run". So future here is not a big problem.


Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[271]	training's rmse: 1.8878	valid_1's rmse: 1.85755


In [88]:

########################### Last Non 0 sale ##########################

def find_last_sale(df,n_day):
    
    # Limit initial df
    ls_df = df[['id','d',TARGET]]
    
    # Convert target to binary
    ls_df['non_zero'] = (ls_df[TARGET]>0).astype(np.int8)
    
    # Make lags to prevent any leakage
    ls_df['non_zero_lag'] = ls_df.groupby(['id'])['non_zero'].transform(lambda x: x.shift(n_day).rolling(2000,1).sum()).fillna(-1)

    temp_df = ls_df[['id','d','non_zero_lag']].drop_duplicates(subset=['id','non_zero_lag'])
    temp_df.columns = ['id','d_min','non_zero_lag']

    ls_df = ls_df.merge(temp_df, on=['id','non_zero_lag'], how='left')
    ls_df['last_sale'] = ls_df['d'] - ls_df['d_min']

    return ls_df[['last_sale']]


In [89]:

# Find last non zero
grid_df = pd.concat([grid_df, find_last_sale(grid_df,1)], axis=1)

# Make features test
test_model = make_fast_test(grid_df)

# Remove test features
keep_cols = [col for col in list(grid_df) if 'last_sale' not in col]
grid_df = grid_df[keep_cols]

########################### Apply on grid_df ###################################################
# lets read grid from 
grid_df = pd.read_pickle(path+'grid_part_1.pkl')
#grid_df=grid_part_1.copy()
#grid_df['d']=grid_df['d'].str[2:6].astype(int) #need to change 'd' into numeric
grid_df[TARGET][grid_df['d']>(1913-28)] = np.nan
base_cols = list(grid_df)


Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[370]	training's rmse: 1.79999	valid_1's rmse: 1.78604


In [90]:

icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    grid_df['enc'+col_name+'mean'] = grid_df.groupby(col)[TARGET].transform('mean').astype(np.float16)
    grid_df['enc'+col_name+'std'] = grid_df.groupby(col)[TARGET].transform('std').astype(np.float16)

keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d']+keep_cols]

# #################################################################################
print('Save Mean/Std encoding')
grid_df.to_pickle(path+'mean_encoding_df.pkl')

########################### Final list of new features
#################################################################################
grid_df.info()

Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']
Save Mean/Std encoding
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4578794 entries, 0 to 4578793
Data columns (total 24 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   id                         category
 1   d                          int16   
 2   enc_state_id_mean          float16 
 3   enc_state_id_std           float16 
 4   enc_store_id_mean          float16 
 5   enc_store_id_std           float16 
 6   enc_cat_id_mean            float16 
 7   enc_cat_id_std             float16 
 8   enc_dept_id_mean           float16 
 9   enc_dept_id_std            float16 
 10  enc_state_id_cat_id_mean   float16 
 11  enc_state_id_cat_id_std 

# Part 4 Final forecasting
- part 4.1 Parallelization

In [91]:
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

# Part 4.2 get data

In [104]:
def get_data_by_store(store):
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)

    df = df[df['store_id']==store]
    
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 
    
    df = pd.concat([df, df3], axis=1)
    del df3 
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('/content/drive/My Drive/GoogleColab/AUX_MODELS/test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


def make_lag(LAG_DAY):
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(LAG_DAY)).astype(np.float16)
    return lag_df[[col_name]]


def make_lag_roll(LAG_DAY):
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return lag_df[[col_name]]


# Part 4.3 Model Selection

In [93]:
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 1400,
                    'boost_from_average': False,
                    'verbose': -1,
                }

# Part 4.4 Train the model

In [94]:
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913               # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = True               # Use or not pretrained models


remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std'] 

# #PATHS for Features
# ORIGINAL = '../input/m5-forecasting-accuracy/'
# BASE     = '../input/m5-simple-fe/grid_part_1.pkl'
# PRICE    = '../input/m5-simple-fe/grid_part_2.pkl'
# CALENDAR = '../input/m5-simple-fe/grid_part_3.pkl'
# LAGS     = '../input/m5-lags-features/lags_df_28.pkl'
# MEAN_ENC = '../input/m5-custom-features/mean_encoding_df.pkl'

#PATHS for Features
ORIGINAL = '/content/drive/My Drive/GoogleColab/'
BASE     = path+'grid_part_1.pkl'
PRICE    = path+'grid_part_2.pkl'
CALENDAR = path+'grid_part_3.pkl'
LAGS     = path+'lags_df_28.pkl'
MEAN_ENC = path+'mean_encoding_df.pkl'

# AUX(pretrained) Models paths
AUX_MODELS = '/content/drive/My Drive/GoogleColab/AUX_MODELS/'


#STORES ids
#STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_evaluation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())


#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]
ROLS_SPLIT = []
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])

In [95]:
if USE_AUX:
    lgb_params['n_estimators'] = 2
    
# Some 'logs' that can compare
#Train CA_1
#[100]	valid_0's rmse: 2.02289
#[200]	valid_0's rmse: 2.0017
#[300]	valid_0's rmse: 1.99239
#[400]	valid_0's rmse: 1.98471
#[500]	valid_0's rmse: 1.97923
#[600]	valid_0's rmse: 1.97284
#[700]	valid_0's rmse: 1.96763
#[800]	valid_0's rmse: 1.9624
#[900]	valid_0's rmse: 1.95673
#[1000]	valid_0's rmse: 1.95201
#[1100]	valid_0's rmse: 1.9476
#[1200]	valid_0's rmse: 1.9434
#[1300]	valid_0's rmse: 1.9392
#[1400]	valid_0's rmse: 1.93446


In [106]:
STORES_IDS=['CA_1'] # for testing purpose, only one store (CA_1) is used.

In [107]:
for store_id in STORES_IDS:
    print('Train', store_id)
    
    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)
    
    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validatio set)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032

    # "To avoid any conversions, we should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')
    
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle(path+'test_'+store_id+'.pkl')
    del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(AUX_MODELS+model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1


# Part 4.5 Final forecasting

In [108]:
#USE_AUX=False
#Predict Model
all_preds = pd.DataFrame()

base_test = get_base_test()

main_time = time.time()

for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
        
    for store_id in STORES_IDS:

        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        estimator = pickle.load(open(model_path, 'rb'))
        #estimator=pi
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])

    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.25 min round |  0.25 min total |  594.59 day sales |
Predict | Day: 2
##########  0.25 min round |  0.49 min total |  595.16 day sales |
Predict | Day: 3
##########  0.25 min round |  0.74 min total |  594.98 day sales |
Predict | Day: 4
##########  0.25 min round |  0.98 min total |  594.78 day sales |
Predict | Day: 5
##########  0.25 min round |  1.23 min total |  596.91 day sales |
Predict | Day: 6
##########  0.24 min round |  1.47 min total |  603.99 day sales |
Predict | Day: 7
##########  0.25 min round |  1.72 min total |  603.96 day sales |
Predict | Day: 8
##########  0.25 min round |  1.96 min total |  594.47 day sales |
Predict | Day: 9
##########  0.25 min round |  2.21 min total |  594.46 day sales |
Predict | Day: 10
##########  0.24 min round |  2.45 min total |  594.35 day sales |
Predict | Day: 11
##########  0.25 min round |  2.70 min total |  594.18 day sales |
Predict | Day: 12
##########  0.25 min round |  2.95 min total |  596.01 d

Process ForkPoolWorker-48:
Process ForkPoolWorker-47:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "<ipython-input-104-6f8da7eef733>", line 50, in make_lag_roll
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


KeyboardInterrupt: ignored

  File "/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py", line 494, in transform
    func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py", line 537, in _transform_general
    res = func(group, *args, **kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-104-6f8da7eef733>", line 50, in make_lag_roll
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py", line 494, in transform
    func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/groupby/generic.py", line 537, in _trans

***Since RAM got full, therefore it need to interrupt the execution while the model was predicting successfully upto days 17.***

In [110]:
all_preds

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17
0,HOBBIES_1_010_CA_1_validation,0.971863,0.971863,0.971863,0.969863,0.969863,0.979703,0.979703,0.969863,0.974684,0.974684,0.976968,0.976968,0.990237,0.990237,0.976968,0.976968,0.979966
1,HOBBIES_1_010_CA_1_validation,0.971863,0.971863,0.971863,0.969863,0.969863,0.979703,0.979703,0.969863,0.974684,0.974684,0.976968,0.976968,0.990237,0.990237,0.976968,0.976968,0.979966
2,HOBBIES_1_010_CA_1_validation,0.971863,0.971863,0.971863,0.969863,0.969863,0.979703,0.979703,0.969863,0.974684,0.974684,0.976968,0.976968,0.990237,0.990237,0.976968,0.979966,0.979966
3,HOBBIES_1_010_CA_1_validation,0.971863,0.971863,0.971863,0.969863,0.969863,0.979703,0.979703,0.969863,0.974684,0.974684,0.976968,0.976968,0.990237,0.990237,0.976968,0.979966,0.979966
4,HOBBIES_1_010_CA_1_validation,0.971863,0.971863,0.971863,0.969863,0.969863,0.979703,0.979703,0.969863,0.974684,0.974684,0.976968,0.976968,0.990237,0.990237,0.979966,0.976968,0.979966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38666235,FOODS_3_820_CA_1_validation,1.040883,1.040883,1.040883,1.040883,1.024632,1.042537,1.042537,1.024632,1.018203,1.018203,1.018203,1.018203,1.042537,1.042537,1.018203,1.011704,1.011704
38666236,FOODS_3_820_CA_1_validation,1.040883,1.040883,1.040883,1.040883,1.024632,1.042537,1.042537,1.024632,1.018203,1.018203,1.018203,1.018203,1.042537,1.042537,1.010994,1.018203,1.010994
38666237,FOODS_3_820_CA_1_validation,1.040883,1.040883,1.040883,1.040883,1.024632,1.042537,1.042537,1.024632,1.018203,1.018203,1.018203,1.018203,1.042537,1.042537,1.010994,1.018203,1.011704
38666238,FOODS_3_820_CA_1_validation,1.040883,1.040883,1.040883,1.040883,1.024632,1.042537,1.042537,1.024632,1.018203,1.018203,1.018203,1.018203,1.042537,1.042537,1.010994,1.011704,1.010994
