In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce

from tqdm import tqdm
from scipy import stats
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pyarrow as pa
import pyarrow.parquet as pq

import warnings
warnings.filterwarnings('ignore')

def reduce_mem(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [2]:
input_data_dir = '../data/original'
save_data_dir = '../data/preprocessed'

In [3]:
sales = pd.read_csv(f'{input_data_dir}/sales_train_evaluation.csv')
sell_prices = pd.read_csv(f'{input_data_dir}/sell_prices.csv')
calendar = pd.read_csv(f'{input_data_dir}/calendar.csv')

# reduce memory usage
sales = reduce_mem(sales)
sell_prices = reduce_mem(sell_prices)
calendar = reduce_mem(calendar)

Mem. usage decreased to 96.13 Mb (78.8% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Mem. usage decreased to  0.12 Mb (41.9% reduction)


In [4]:
calendar.date = pd.to_datetime(calendar.date)

# 
calendar['year_delta'] = 2016 - calendar.year

# cyclic encodings
calendar['quarter_sin'] = np.sin(2 * np.pi * calendar.date.dt.quarter/4.0)
calendar['quarter_cos'] = np.cos(2 * np.pi * calendar.date.dt.quarter/4.0)
calendar['month_sin'] = np.sin(2 * np.pi * calendar.month/12.0)
calendar['month_cos'] = np.cos(2 * np.pi * calendar.month/12.0)
calendar['day_sin'] = np.sin(2 * np.pi * calendar.date.dt.day/calendar.date.dt.days_in_month)
calendar['day_cos'] = np.cos(2 * np.pi * calendar.date.dt.day/calendar.date.dt.days_in_month)
calendar['weekday_sin'] = np.sin(2 * np.pi * calendar.wday/7.0)
calendar['weekday_cos'] = np.cos(2 * np.pi * calendar.wday/7.0)

# event count
calendar['event_count'] = calendar[['event_name_1', 'event_name_2']].notna().sum(axis=1)

# event encodings
event_names = ['event_name_1', 'event_name_2']
event_names_enc = ['event_name_1_enc', 'event_name_2_enc']
calendar[event_names_enc] = calendar[event_names]
event_names_encoder = ce.OrdinalEncoder(cols=event_names_enc)
event_names_encoder.fit(calendar)
event_names_encoder.mapping[1]['mapping'] = event_names_encoder.mapping[0]['mapping']
calendar = event_names_encoder.transform(calendar)
for col in event_names_enc:
    calendar[col] = calendar[col] - 1

event_types = ['event_type_1', 'event_type_2']
event_types_enc = ['event_type_1_enc', 'event_type_2_enc']
calendar[event_types_enc] = calendar[event_types]
event_type_encoder = ce.OrdinalEncoder(cols=event_types_enc)
event_type_encoder.fit(calendar)
event_type_encoder.mapping[1]['mapping'] = event_type_encoder.mapping[0]['mapping']
calendar = event_type_encoder.transform(calendar)
for col in event_types_enc:
    calendar[col] = calendar[col] - 1

#
calendar_df = calendar[['wm_yr_wk', 'd', 'snap_CA', 'snap_TX', 'snap_WI', 'year_delta',
                        'quarter_sin', 'quarter_cos', 'month_sin', 'month_cos', 
                        'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos', 'event_count']
                        + event_names_enc 
                        + event_types_enc]

In [5]:
#
del sales['id']

# release
release = sell_prices.groupby(['item_id', 'store_id'])['wm_yr_wk'].min().reset_index()
release['release'] = 1
sell_prices = sell_prices.merge(right=release[['item_id', 'store_id', 'release']], on=['item_id', 'store_id'], how='left')
sell_prices['release'] = sell_prices['release'].fillna(0)

# relative sell price
sell_prices['sell_price_in_store'] = sell_prices['sell_price'] / sell_prices.groupby(['store_id', 'wm_yr_wk'])['sell_price'].transform('mean')

# diff
sell_prices['sell_price_diff'] = sell_prices['sell_price'].diff().fillna(0)

In [6]:
#
data_df = pd.melt(sales, id_vars=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_vars=['d_'+str(i) for i in range(1, 1942)], value_name='sales')

# merge
data_df = data_df.merge(right=calendar_df[['d', 'wm_yr_wk']], on=['d'], how='left')
data_df = data_df.merge(right=sell_prices[['item_id', 'store_id', 'wm_yr_wk', 'sell_price', 'release', 'sell_price_in_store', 'sell_price_diff']], on=['item_id', 'store_id', 'wm_yr_wk'], how='left')

# fill nan 0 (before release)
data_df.release = data_df.release.fillna(0.0)
data_df.sell_price = data_df.sell_price.fillna(0.0)

# accumulate after release
data_df['release'] = data_df.groupby(['item_id', 'store_id'])['release'].transform(lambda x: (x > 0).cumsum())

# out of stock
def count_consecutive_zeros(group):
    zeros = group['sales'] == 0
    reset = ~zeros
    groups = reset.cumsum()
    result = zeros.groupby(groups).cumsum()
    result = result.where(zeros, 0)
    return result
data_df['out_of_stock'] = data_df.groupby(['item_id', 'store_id']).apply(lambda x: count_consecutive_zeros(x)).reset_index(drop=True)

#
del data_df['wm_yr_wk']

In [7]:
#
del calendar_df['wm_yr_wk']

#
calendar_df.to_csv(f'{save_data_dir}/calendar_df.csv', index=False)

In [8]:
levels = [
    [],                        # Level 1: Total
    ['state_id'],              # Level 2: State
    ['store_id'],              # Level 3: Store
    ['cat_id'],                # Level 4: Category
    ['dept_id'],               # Level 5: Department
    ['state_id', 'cat_id'],    # Level 6: State-Category
    ['state_id', 'dept_id'],   # Level 7: State-Department
    ['store_id', 'cat_id'],    # Level 8: Store-Category
    ['store_id', 'dept_id'],   # Level 9: Store-Department
    ['item_id'],               # Level 10: Item
    ['item_id', 'state_id'],   # Level 11: Item-State
    ['item_id', 'store_id']    # Level 12: Individual
]

agg_funcs = {
    'sales': [ 
        ('sales_sum', 'sum'), # 판매량 합계
        ('sales_mean', 'mean'), # 판매량 평균값
        ('sales_std', 'std'), # 판매량 표준편차
        ('sales_max', 'max'), # 판매량 최대값
        ('sales_min', 'min'), # 판매량 최소값
        ('sales_lag1', lambda x: x.shift(1).iloc[-1]), # 1일 전 판매량
        ('sales_lag7', lambda x: x.shift(7).iloc[-1]), # 7일 전 판매량
        ('sales_lag28', lambda x: x.shift(28).iloc[-1]), # 28일 전 판매량
        ('sales_rolling7_mean', lambda x: x.rolling(window=7, min_periods=1).mean().iloc[-1]), # 최근 7일 이동 평균
        ('sales_rolling28_mean', lambda x: x.rolling(window=28, min_periods=1).mean().iloc[-1]), # 최근 28일 이동 평균
        ('sales_trend', lambda x: x.reset_index(drop=True).corr(pd.Series(range(len(x))))), # 판매량 추세
     ],
    'release': [('release_mean', 'mean')], # 최초 판매량의 평균값
    'out_of_stock': [('out_of_stock_mean', 'mean')], # 재고 없음 평균값
    'sell_price': [
        ('sell_price_mean', 'mean'), # 평균 판매 가격
        ('sell_price_std', 'std'), # 판매 가격 표준편차
        ('sell_price_max', 'max'), # 판매 가격 최대값
        ('sell_price_min', 'min'), # 판매 가격 최소값
        ('sell_price_diff', 'mean'), # 판매 가격 변화량
        ('sell_price_trend', lambda x: x.reset_index(drop=True).corr(pd.Series(range(len(x))))), # 판매 가격 추세
    ],
    'sell_price_in_store': [('sell_price_in_store_mean', 'mean')], # 메징 안에서의 상대적 판매 가격
}

for i, level in tqdm(enumerate(levels), total=len(levels)):
    # aggregation
    agg_cols = level + ['d'] if level else ['d']
    agg_df = data_df.groupby(agg_cols).agg(**{
        new_col: (col, func) 
        for col, aggs in agg_funcs.items() 
        for new_col, func in aggs
    }).reset_index()

    # sorting
    agg_df['sort_key'] = agg_df['d'].str[2:].astype(int)
    agg_df = agg_df.sort_values(['sort_key'] + level)
    agg_df = agg_df.drop(columns=['sort_key']).reset_index(drop=True)
    
    agg_df.insert(0, 'level', i+1)
    agg_df.to_csv(f'{save_data_dir}/agg_df_level_{i+1}.csv', index=False)

100%|██████████| 12/12 [4:18:18<00:00, 1291.56s/it]  
