In [2]:
import pandas as pd
import numpy as np
import category_encoders as ce


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Directory the input CSV files are read from (relative to the notebook's working directory).
input_data_dir = '../data/original'

In [4]:
# Load the three input tables from CSV files under `input_data_dir`.
# train_data: wide sales table (one `d_<n>` column per day; melted to long form later on).
train_data = pd.read_csv(f'{input_data_dir}/sales_train_evaluation.csv')
# sell_prices: price rows keyed by item_id / store_id / wm_yr_wk.
sell_prices = pd.read_csv(f'{input_data_dir}/sell_prices.csv')
# calendar: one row per day label `d`, with date parts, event names and snap_* flags.
calendar = pd.read_csv(f'{input_data_dir}/calendar.csv')

In [None]:
# Parse the date column once so the `.dt` accessors below are available.
calendar['date'] = pd.to_datetime(calendar['date'])

# Year offset counted backwards from 2016 (rows with year == 2016 get 0).
calendar['relative_year'] = 2016 - calendar['year']

# Cyclic encodings: every periodic calendar field is mapped to an angle and
# stored as a (sin, cos) pair, so the end of a period lands next to its start.
full_turn = 2 * np.pi
calendar_dates = calendar['date'].dt

quarter_angle = full_turn * calendar_dates.quarter / 4.0
month_angle = full_turn * calendar['month'] / 12.0
# Day of month is scaled by the length of its own month.
day_angle = full_turn * calendar_dates.day / calendar_dates.days_in_month
weekday_angle = full_turn * calendar['wday'] / 7.0

calendar['quarter_sin'] = np.sin(quarter_angle)
calendar['quarter_cos'] = np.cos(quarter_angle)
calendar['month_sin'] = np.sin(month_angle)
calendar['month_cos'] = np.cos(month_angle)
calendar['day_sin'] = np.sin(day_angle)
calendar['day_cos'] = np.cos(day_angle)
calendar['weekday_sin'] = np.sin(weekday_angle)
calendar['weekday_cos'] = np.cos(weekday_angle)

# Event encodings: ordinal-encode copies of the two event-name columns,
# leaving the raw string columns untouched.
cal_label = ['event_name_1', 'event_name_2']
cal_label_encoded_cols = ['event_name_1_enc', 'event_name_2_enc']
calendar[cal_label_encoded_cols] = calendar[cal_label]

cal_label_encoder = ce.OrdinalEncoder(cols=cal_label_encoded_cols)
cal_label_encoder.fit(calendar)

# Reuse the mapping fitted for the first column on the second one, so an event
# name receives the same code in both columns.
# NOTE(review): an event_name_2 value that never occurs in event_name_1 has no
# entry in this shared mapping -- confirm how the encoder codes such values.
shared_event_mapping = cal_label_encoder.mapping[0]['mapping']
cal_label_encoder.mapping[1]['mapping'] = shared_event_mapping

calendar = cal_label_encoder.transform(calendar)

# Shift every code down by one (presumably so the codes start at 0 -- confirm
# against the encoder's numbering).
calendar[cal_label_encoded_cols] = calendar[cal_label_encoded_cols] - 1

# Calendar features kept for the model: the join keys (`wm_yr_wk`, `d`), the
# SNAP flags, the year offset, the cyclic encodings and the event codes.
calendar_feature_cols = (
    ['wm_yr_wk', 'd', 'snap_CA', 'snap_TX', 'snap_WI', 'relative_year',
     'quarter_sin', 'quarter_cos', 'month_sin', 'month_cos',
     'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos']
    + cal_label_encoded_cols
)

# Reduce memory usage: int8 for the flags / year offset, int16 for the event codes.
# Casting through a single `astype` call returns a new, independent frame, so
# nothing is ever assigned into a slice of `calendar` (the original pattern
# triggered SettingWithCopyWarning, which the global warnings filter hid).
calendar_dtypes = {
    **dict.fromkeys(['snap_CA', 'snap_TX', 'snap_WI', 'relative_year'], np.int8),
    **dict.fromkeys(cal_label_encoded_cols, np.int16),
}
calendar_df = calendar[calendar_feature_cols].astype(calendar_dtypes)

In [6]:
# Series key in the form '<item_id>_<store_id>', built the same way as the `id`
# key on `sell_prices` so the two tables can be joined on it.
# Rebuilding the key from its parts (instead of chopping the last 11 characters
# off the existing id) keeps this cell idempotent: re-running it no longer
# truncates the ids a second time.
# NOTE(review): assumes the raw id is '<item_id>_<store_id>' plus an
# 11-character suffix, which is what the original `[:-11]` slice relied on.
train_data['id'] = train_data['item_id'] + '_' + train_data['store_id']

# Ordinal-encode copies of the categorical id columns; the string columns stay as they are.
label = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
label_encoded_cols = [f'{name}_enc' for name in label]
train_data[label_encoded_cols] = train_data[label]

label_encoder = ce.OrdinalEncoder(cols=label_encoded_cols)
label_encoder.fit(train_data)
train_data = label_encoder.transform(train_data)

# Shift every code down by one (presumably so the codes start at 0 -- confirm
# against the encoder's numbering).
train_data[label_encoded_cols] = train_data[label_encoded_cols] - 1

In [7]:
# Series key in the form '<item_id>_<store_id>'.
sell_prices['id'] = sell_prices['item_id'] + '_' + sell_prices['store_id']

# first sell
# One row per id holding the earliest `wm_yr_wk` that has a price, flagged with 1.
first_sell = sell_prices.groupby('id')['wm_yr_wk'].min().reset_index()
first_sell['first_sell'] = 1

# Drop a flag column left over from a previous run of this cell; without this,
# a re-run makes the merge emit `first_sell_x` / `first_sell_y` and the
# `sell_prices['first_sell']` lookup below raises KeyError.
sell_prices = sell_prices.drop(columns='first_sell', errors='ignore')

# NOTE(review): the join key is `id` only, so EVERY price row of an id receives
# the flag (not just the row of its first week) and the fillna below never has
# anything to fill. Joining on ['id', 'wm_yr_wk'] would flag the first week only;
# confirm which behaviour is intended before changing it, since the downstream
# cumulative count depends on it.
sell_prices = sell_prices.merge(right=first_sell[['id', 'first_sell']], on='id', how='left')
sell_prices['first_sell'] = sell_prices['first_sell'].fillna(0)

In [8]:
# Reshape the wide sales table into long form: one row per (series, day).
# The day columns are discovered from the frame (`d_<number>`, in file order)
# rather than hard-coding the number of days, and the identifier columns are
# assembled from the existing label lists instead of being spelled out again.
day_cols = [col for col in train_data.columns
            if col.startswith('d_') and col[2:].isdigit()]
data_df = pd.melt(train_data,
                  id_vars=['id'] + label + label_encoded_cols,
                  value_vars=day_cols,
                  var_name='d', value_name='sales')

# reduce memory usage
# Assigned column-wise on purpose: casting the whole frame would copy every column.
data_df[label_encoded_cols] = data_df[label_encoded_cols].astype(np.int16)

In [None]:
# Attach the week id of every day, then the price rows of that (id, week).
# Both are left joins, so every sales row is kept.
week_of_day = calendar_df[['d', 'wm_yr_wk']]
weekly_prices = sell_prices[['id', 'wm_yr_wk', 'first_sell', 'sell_price']]
data_df = (
    data_df
    .merge(right=week_of_day, on=['d'], how='left')
    .merge(right=weekly_prices, on=['id', 'wm_yr_wk'], how='left')
)

# Days without a matching price row (before release) come back as NaN: use 0 instead.
for price_col in ('first_sell', 'sell_price'):
    data_df[price_col] = data_df[price_col].fillna(0.0)

# Turn the flag into a running count per id: the number of flagged rows seen so
# far, in the frame's row order.
data_df['first_sell'] = data_df['first_sell'].gt(0).groupby(data_df['id']).cumsum()

# out of stock
def count_consecutive_zeros(group):
    """Return the length of the current run of zero sales for every row.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``sales`` column; rows are processed in their given order.

    Returns
    -------
    pandas.Series
        Same index as ``group``. A row with ``sales == 0`` holds how many
        consecutive zero rows (itself included) end at that row; every other
        row holds 0.
    """
    is_zero = group['sales'].eq(0)
    # Each non-zero row opens a new run label; the zeros that follow share it.
    run_label = (~is_zero).cumsum()
    streak = is_zero.groupby(run_label).cumsum()
    return streak.where(is_zero, 0)
# Length of the current run of consecutive zero-sales rows within each id
# (0 on every row with non-zero sales), following the frame's row order.
# Computed with two grouped cumulative sums over the whole frame instead of a
# Python-level `groupby('id').apply(...)`: same values, no per-series function
# call, and no dependence on how `apply` shapes the index of its result.
is_zero_sale = data_df['sales'].eq(0)
# Run label inside an id: increments on each non-zero row, so the zero rows that
# follow a non-zero row share its label.
run_label = (~is_zero_sale).groupby(data_df['id']).cumsum()
data_df['out_of_stock'] = is_zero_sale.groupby([data_df['id'], run_label]).cumsum()

# remove category columns
# The week key and the string id columns are no longer needed; they are deleted
# in place to avoid copying the frame.
for unused_col in ['wm_yr_wk', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']:
    del data_df[unused_col]

In [9]:
# Dataset dimensions, taken before `id` and `d` are moved into the index.
num_samples = data_df['id'].nunique()
num_timesteps = data_df['d'].nunique()

# Index the long table by (series id, day label).
data_df = data_df.set_index(['id', 'd'])

# Names of the categorical id columns.
ids = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
# Feature list: the price followed by the encoded id columns.
enc_dec_feats = ['sell_price', *label_encoded_cols]