**IMPORT PACKAGES | BUILD CUSTOM FUNCTIONS | SET PARAMETERS**

In [None]:
import numpy as np
import pandas as pd
import logging
import datetime
import lightgbm as lgb
import random
import os
import psutil
import argparse
import time
import warnings
import gc
import pickle
import math
import shutil
import math, decimal

from math import ceil
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

In [None]:
# Pin every source of randomness used in this notebook to the same fixed seed:
# the hash seed environment variable, NumPy's global RNG and Python's `random`.
os.environ['PYTHONHASHSEED'] = '42'
np.random.seed(42)
random.seed(42)

In [None]:
class Util(object):
    """Stateless helpers: RNG seeding, process memory reporting and DataFrame utilities."""

    @staticmethod
    def set_seed(seed):
        """Seed Python's `random`, NumPy's global RNG and the PYTHONHASHSEED variable.

        Args:
            seed: Value passed to `random.seed` / `np.random.seed`; its `str()` is
                stored in `os.environ['PYTHONHASHSEED']`.
        """
        random.seed(seed)
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        return

    @staticmethod
    def get_memory_usage():
        """Return the first field of this process's `psutil` memory_info, in GiB (2 decimals)."""
        return np.round(psutil.Process(os.getpid()).memory_info()[0] / 2. ** 30, 2)

    @staticmethod
    def reduce_mem_usage(df, verbose=False):
        """Downcast numeric columns of `df` to the narrowest dtype that holds their range.

        Only columns whose dtype is int16/32/64 or float16/32/64 are touched.
        Integer columns go to the smallest of int8/16/32/64 whose range contains
        [min, max] (bounds inclusive); float columns go to float16, float32 or
        float64 by the same rule.

        NOTE: the float downcast is chosen on range only, so precision is lost
        (float16 keeps roughly 3 significant decimal digits).

        Args:
            df: DataFrame to shrink. Columns are reassigned on this object.
            verbose: When True, print the final size and the percentage saved.

        Returns:
            The same DataFrame object, with downcast columns.
        """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024 ** 2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Inclusive bounds: a column whose max is exactly 127 still fits int8.
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in (np.float16, np.float32):
                    limits = np.finfo(candidate)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of float32 range, or min/max is NaN (all comparisons False).
                    df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024 ** 2
        if verbose:
            # Guard the percentage against a zero-byte starting size.
            reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
        return df

    @staticmethod
    def merge_by_concat(df1, df2, merge_on):
        """Left-join the columns of `df2` onto `df1` while keeping `df1`'s dtypes and index.

        Only the key columns of `df1` take part in the merge; the new columns
        are then glued onto `df1` side by side.

        Args:
            df1: Left DataFrame; its rows, order and index are preserved.
            df2: Right DataFrame providing the extra columns.
            merge_on: List of key column names present in both frames.

        Returns:
            A new DataFrame with `df1`'s columns followed by the non-key columns of `df2`.

        Raises:
            ValueError: If the left merge changes the row count (duplicate keys in
                `df2`), because the side-by-side concat would then pair wrong rows.
        """
        merged_gf = df1[merge_on]
        merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
        new_columns = [col for col in list(merged_gf) if col not in merge_on]
        extra = merged_gf[new_columns].copy()
        if len(extra) != len(df1):
            raise ValueError(
                'merge_by_concat: merge on {} changed the row count from {} to {}; '
                'df2 must have unique keys'.format(merge_on, len(df1), len(extra)))
        # `merge` returns a fresh 0..n-1 index; re-label it with df1's index so the
        # axis=1 concat pairs rows by position even when df1 was filtered or re-indexed.
        extra.index = df1.index
        df1 = pd.concat([df1, extra], axis=1)
        return df1

**OPTION TO DOWNLOAD DATA**

In [None]:
# %%bash
!pip install dask_xgboost
# pip install kaggle
# export KAGGLE_USERNAME=jmiller558
# export KAGGLE_KEY=812fcd89e3a0fc00cb629bf2306b215e

# kaggle competitions download -c m5-forecasting-accuracy

# unzip -n m5-forecasting-accuracy -d m5-forecasting-accuracy
# rm -rf sample_data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dask_xgboost
  Downloading dask_xgboost-0.2.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: dask-xgboost
Successfully installed dask-xgboost-0.2.0


**OPTION TO MOUNT DRIVE WITH DATA**

In [None]:
# Colab-only: mount Google Drive at /content/drive, the root of every data path used below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**PATHS FOR DATA**

In [None]:
#input paths for base data
# Single root for the raw competition CSVs; edit here if the Drive layout changes.
m5_data_dir = '/content/drive/MyDrive/Capstone/Models/m5-forecasting-accuracy'
trainpath = m5_data_dir + '/sales_train_evaluation.csv'
pricepath = m5_data_dir + '/sell_prices.csv'
calpath = m5_data_dir + '/calendar.csv'
submissionpath = m5_data_dir + '/sample_submission.csv'

In [None]:
#input paths for Feature Engineered DFs
# All cached feature pickles live in one Drive folder; `lagfeats_path` is a prefix
# that gets the prediction horizon appended when the lag files are written/read.
grid_base_path, calfeats_path, pricefeats_path, encoding_path, lagfeats_path = (
    '/content/drive/MyDrive/Capstone/Models/Weekly_lightgbm/' + leaf
    for leaf in (
        'grid_base_weekly',
        'calfeats_weekly',
        'pricefeats_weekly',
        'encodingfeats_weekly',
        'weekly_lagfeats_',
    )
)

**LOAD BASE COMPETITION DATA**

In [None]:
def load_data():
    """Read the four raw competition CSVs named by the module-level path variables.

    Returns:
        tuple: DataFrames read from `trainpath`, `pricepath`, `calpath` and
        `submissionpath`, in that order.
    """
    csv_paths = (trainpath, pricepath, calpath, submissionpath)
    train, prices, calendar, submission = (pd.read_csv(path) for path in csv_paths)
    return train, prices, calendar, submission

train_df, prices_df, calendar_df, submission_df = load_data()

**Set Variables**

In [None]:
# Name of the column being forecast.
target = 'demand'
# Inclusive range of `week` index values used for training.
start_train_week_x, end_train_week_x = 1, 264
# Default number of weeks to forecast past `end_train_week_x`.
prediction_horizon = 12

# Columns that are never handed to the model as features.
remove_features = [
    'id',
    'state_id',
    'store_id',
    'wm_yr_wk',
    'week',
    target,
]

**BASE FEATURE ENGINEERING (ONLY NEEDED FIRST TIME)**



*   BUILD GRID_BASE





In [None]:
# Identifier columns that stay fixed; every other column of train_df is unpivoted
# into one row per (identifiers, d) with its value stored under `target`.
index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
grid_df = train_df.melt(
    id_vars=index_columns,
    var_name='d',
    value_name=target,
)

In [None]:
# First wm_yr_wk in which each (store, item) pair has a price row.
release_df = (
    prices_df.groupby(['store_id', 'item_id'])['wm_yr_wk']
    .min()
    .rename('release')
    .reset_index()
)
grid_df = Util.merge_by_concat(grid_df, release_df, ['store_id', 'item_id'])
del release_df

# Translate the day label `d` into its Walmart week id.
grid_df = Util.merge_by_concat(grid_df, calendar_df[['wm_yr_wk', 'd']], ['d'])
grid_df = grid_df.reset_index(drop=True)

# Store the release week as an offset from the earliest release, in int16.
grid_df['release'] = (grid_df['release'] - grid_df['release'].min()).astype(np.int16)

In [None]:
# Last Walmart week (wm_yr_wk) kept in the weekly grid; rows after it are dropped.
# NOTE(review): presumably the final *complete* week in the data -- confirm.
last_wm_yr_wk = 11616
grid_df = grid_df[grid_df['wm_yr_wk'] <= last_wm_yr_wk]

# Collapse the daily rows to one row per (id, Walmart week) by summing the target.
# `target` is used instead of a hard-coded column name so the cell follows the config.
weekly_df = grid_df.groupby(['id', 'wm_yr_wk'], as_index=False)[target].sum()

# Re-attach the per-id attributes that the aggregation dropped.
grid_to_merge = grid_df[index_columns + ['release']].drop_duplicates()
weekly_df = weekly_df.merge(grid_to_merge, on=['id'], how='left')[
    index_columns + ['release', 'wm_yr_wk', target]]

# 0-based sequential week index in ascending wm_yr_wk order. Sorting makes the
# numbering independent of the row order left by the groupby/merge above.
week_ids = np.sort(weekly_df['wm_yr_wk'].unique())
week_df = pd.DataFrame(data={'wm_yr_wk': week_ids, 'week': range(len(week_ids))})
weekly_df = weekly_df.merge(week_df, on=['wm_yr_wk'], how='left')

In [None]:
# Store every identifier column as a pandas categorical.
weekly_df = weekly_df.astype({name: 'category' for name in index_columns})

In [None]:
# Cache the weekly base grid as a pickle at grid_base_path; later cells re-load it from there.
weekly_df.to_pickle(grid_base_path)

In [None]:
# Release the intermediates of the base-grid build (raises NameError if any name is already gone).
del grid_df,weekly_df,grid_to_merge,week_df

*   BUILD CAL FEATURES



In [None]:
# One row per Walmart week with the smallest `date` value of that week.
cal_base = (
    calendar_df.groupby('wm_yr_wk')['date']
    .min()
    .rename('wk_start_date')
    .reset_index()
)

In [None]:
# Append indicator (dummy) columns for the two event name/type column pairs to the daily calendar.
cal_full = pd.concat([calendar_df,pd.get_dummies(calendar_df[['event_name_1','event_type_1','event_name_2','event_type_2']])],axis=1)

In [None]:
# Remove the raw date parts and the original event columns (now covered by the
# dummies); everything left is summed per week in the next cell.
cal_full = cal_full.drop(
    columns=[
        'date', 'weekday', 'wday', 'month', 'year', 'd',
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    ]
)

In [None]:
# Sum the remaining daily columns within each wm_yr_wk and attach them to the week-start table.
cal_final = cal_base.merge(cal_full.groupby('wm_yr_wk').sum().reset_index(), on=['wm_yr_wk'], how='left')

In [None]:
dec = decimal.Decimal

def get_moon_phase(d):
    """Return the moon-phase index (0-7) for a 'YYYY-MM-DD' date string.

    Per the original note: 0 = new moon, 4 = full moon, about 4 days per phase.
    The index is derived from the fractional number of lunations elapsed since
    2001-01-01, using Decimal arithmetic throughout.
    """
    elapsed = datetime.datetime.strptime(d, '%Y-%m-%d') - datetime.datetime(2001, 1, 1)
    elapsed_days = dec(elapsed.days) + dec(elapsed.seconds) / dec(86400)
    lunations = dec("0.20439731") + elapsed_days * dec("0.03386319269")
    cycle_fraction = lunations % dec(1)
    # Round to the nearest eighth of a cycle; `& 7` wraps index 8 back to 0.
    eighth = math.floor(cycle_fraction * dec(8) + dec('0.5'))
    return int(eighth) & 7
        
# Label each week with the moon-phase index of its wk_start_date string.
cal_final['moon'] = cal_final.wk_start_date.apply(get_moon_phase)

In [None]:
# Treat the per-week SNAP sums of the three states as categorical levels.
icols = ['snap_CA', 'snap_TX', 'snap_WI']
cal_final = cal_final.astype({snap_col: 'category' for snap_col in icols})

In [None]:
# Derive compact date-part features from the first date of each week.
cal_final['wk_start_date'] = pd.to_datetime(cal_final['wk_start_date'])

cal_final['tm_d'] = cal_final['wk_start_date'].dt.day.astype(np.int8)
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement. Fall back only on older pandas.
if hasattr(cal_final['wk_start_date'].dt, 'isocalendar'):
    cal_final['tm_w'] = cal_final['wk_start_date'].dt.isocalendar().week.astype(np.int8)
else:
    cal_final['tm_w'] = cal_final['wk_start_date'].dt.week.astype(np.int8)
cal_final['tm_m'] = cal_final['wk_start_date'].dt.month.astype(np.int8)
# Year as an offset from the earliest year so it fits in int8.
cal_final['tm_y'] = cal_final['wk_start_date'].dt.year
cal_final['tm_y'] = (cal_final['tm_y'] - cal_final['tm_y'].min()).astype(np.int8)
# Week of month = ceil(day / 7), computed with exact integer arithmetic
# instead of a per-row Python call.
cal_final['tm_wm'] = ((cal_final['tm_d'] + 6) // 7).astype(np.int8)

# The raw date and day-of-month were only needed to derive the features above.
del cal_final['wk_start_date'],cal_final['tm_d']

In [None]:
# Broadcast the per-week calendar features onto every (id, wm_yr_wk) row of the
# cached base grid, keeping the grid's row order.
grid_df = pd.read_pickle(grid_base_path).loc[:, ['id', 'wm_yr_wk']]
calfeats_final = pd.merge(grid_df, cal_final, how='left', on=['wm_yr_wk'])

In [None]:
# Cache the calendar features at calfeats_path, then release every intermediate of this section.
calfeats_final.to_pickle(calfeats_path)
del cal_final,cal_full,cal_base,grid_df,calfeats_final

*   BUILD PRICING FEATURES



In [None]:
# Attach a calendar month/year to every price row (first calendar row of each week).
calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
prices_df = prices_df.merge(calendar_prices[['wm_yr_wk', 'month', 'year']], on=['wm_yr_wk'], how='left')
del calendar_prices

grid_df = pd.read_pickle(grid_base_path)

# Ignore prices for weeks after the last week in the base grid. `.copy()` gives an
# independent frame, so the column assignments below do not write into a slice.
prices_df = prices_df[prices_df['wm_yr_wk'] <= grid_df['wm_yr_wk'].max()].copy()

# Build the per-(store, item) price groups once and reuse them for every statistic.
item_prices = prices_df.groupby(['store_id', 'item_id'])['sell_price']

prices_df['price_max'] = item_prices.transform('max')
prices_df['price_min'] = item_prices.transform('min')
prices_df['price_std'] = item_prices.transform('std')
prices_df['price_mean'] = item_prices.transform('mean')
prices_df['price_norm'] = prices_df['sell_price'] / prices_df['price_max']
prices_df['price_nunique'] = item_prices.transform('nunique')
# Number of distinct items sold at the same price in the same store.
prices_df['item_nunique'] = prices_df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

# Price relative to the previous row of the same (store, item).
# NOTE(review): assumes rows are ordered by week within each item -- confirm.
# `GroupBy.shift` replaces `transform(lambda x: x.shift(1))`: same values, no per-group Python call.
prices_df['price_momentum'] = prices_df['sell_price'] / item_prices.shift(1)
# Price relative to the item's mean price over the same calendar month / year value.
prices_df['price_momentum_m'] = prices_df['sell_price'] / prices_df.groupby(['store_id', 'item_id', 'month'])[
    'sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price'] / prices_df.groupby(['store_id', 'item_id', 'year'])[
    'sell_price'].transform('mean')

# Fractional (cents) part of each price, vectorised with np.modf instead of a Python loop.
prices_df['sell_price_cent'] = np.modf(prices_df['sell_price'].values)[0]
prices_df['price_max_cent'] = np.modf(prices_df['price_max'].values)[0]
prices_df['price_min_cent'] = np.modf(prices_df['price_min'].values)[0]

# Drop the helper so it does not keep the price data alive after `del prices_df`.
del item_prices
del prices_df['month'], prices_df['year']

In [None]:
# Join the price features onto the base grid and keep only the columns the join
# added, plus the (id, wm_yr_wk) key pair, then downcast to save memory.
grid_df = pd.read_pickle(grid_base_path)
original_columns = grid_df.columns.tolist()
pricefeats_df = pd.merge(
    grid_df,
    prices_df,
    how='left',
    on=['store_id', 'item_id', 'wm_yr_wk'],
)
keep_columns = [name for name in pricefeats_df.columns if name not in original_columns]
pricefeats_df = Util.reduce_mem_usage(pricefeats_df[['id', 'wm_yr_wk'] + keep_columns])
del prices_df

In [None]:
# Cache the price features as a pickle at pricefeats_path.
pricefeats_df.to_pickle(pricefeats_path)

In [None]:
# Release the large intermediates of the pricing section. The names are popped
# rather than `del`-ed so that re-running this cell (or running it after an
# earlier cleanup) is a no-op instead of a NameError.
for _stale_name in ('pricefeats_df', 'grid_df'):
    globals().pop(_stale_name, None)

NameError: ignored

*   CREATE ENCODING FEATURES



In [None]:
encoding_df = pd.read_pickle(grid_base_path)
# Blank the target for every week after the training window so the group
# statistics below are computed from training weeks only.
encoding_df.loc[encoding_df['week'] > end_train_week_x, target] = np.nan

base_cols = encoding_df.columns.tolist()

# Each entry is the list of key columns for one target encoding.
icols = [['cat_id'], ['dept_id'], ['item_id']]

for group_keys in icols:
    prefix = 'enc_' + '_'.join(group_keys) + '_'
    # Mean first, then std, so the columns appear as enc_<keys>_mean, enc_<keys>_std.
    for stat in ('mean', 'std'):
        encoding_df[prefix + stat] = (
            encoding_df.groupby(group_keys)[target].transform(stat).astype(np.float16)
        )

# Keep only the key pair and the newly created encoding columns.
keep_cols = [name for name in encoding_df.columns if name not in base_cols]
encoding_df = encoding_df[['id', 'wm_yr_wk'] + keep_cols]

encoding_df.to_pickle(encoding_path)

In [None]:
# Release the encoding frame; it has already been written to encoding_path.
del encoding_df

**CREATE LAG FEATURES**

In [None]:
# Window sizes shared by every horizon.
num_lag_week = 12
num_rolling_week_list = [3, 6, 12, 24, 48]

# The base grid is identical for every horizon, so it is read and masked once
# instead of once per loop iteration. Target values after the training window
# are blanked so no lag/rolling feature can see them.
lag_base_df = pd.read_pickle(grid_base_path)[['id', 'week', target]]
lag_base_df.loc[lag_base_df['week'] > end_train_week_x, target] = np.nan

# The loop variable is named `horizon` so the `prediction_horizon` config value
# is not overwritten as a side effect of building the features.
for horizon in [3, 6, 9, 12]:
    # Lags start at the horizon: with a gap of `horizon` weeks, shorter lags
    # would not be known at prediction time.
    num_lag_week_list = list(range(horizon, horizon + num_lag_week))

    lagfeats_df = lag_base_df.copy()
    target_by_id = lagfeats_df.groupby(['id'])[target]

    # Plain lags: `GroupBy.shift` gives the same values as
    # `transform(lambda x: x.shift(l))` without a Python call per group.
    lagfeats_df = lagfeats_df.assign(**{
        '{}_lag_{}'.format(target, lag): target_by_id.shift(lag).astype(np.float16)
        for lag in num_lag_week_list
    })

    # Rolling mean/std over the target shifted by the horizon, per id.
    for num_rolling_week in num_rolling_week_list:
        lagfeats_df['rolling_mean_' + str(num_rolling_week)] = target_by_id.transform(
            lambda x: x.shift(horizon).rolling(num_rolling_week).mean()).astype(np.float16)
        lagfeats_df['rolling_std_' + str(num_rolling_week)] = target_by_id.transform(
            lambda x: x.shift(horizon).rolling(num_rolling_week).std()).astype(np.float16)

    # One pickle per horizon: <lagfeats_path><horizon>
    lagfeats_df.to_pickle(lagfeats_path + str(horizon))

del lag_base_df, target_by_id

**CREATE FULL DATASET BY STORE**

In [None]:
def load_df(store_id,end_train_week_x,prediction_horizon):
    """Assemble the modelling frame for one store (or all stores) and one horizon.

    The cached base grid and the price, calendar and encoding pickles are
    concatenated column-wise. The feature pickles were written with leading
    'id' and 'wm_yr_wk' columns, which are skipped here, so the frames are
    paired purely by index.

    Args:
        store_id: Store to keep, or 'all' to keep every store.
        end_train_week_x: Last `week` index of the training window.
        prediction_horizon: Number of weeks kept after `end_train_week_x`;
            also selects which lag-feature pickle is loaded.

    Returns:
        tuple: (full_df, enable_features), where full_df has the columns
        ['id', 'week', target] followed by the feature columns, and
        enable_features lists every column not in `remove_features`.
    """
    frames = [pd.read_pickle(grid_base_path)]
    for feature_path in (pricefeats_path, calfeats_path, encoding_path):
        # Skip the two key columns; the base grid already carries them.
        frames.append(pd.read_pickle(feature_path).iloc[:, 2:])
    full_df = pd.concat(frames, axis=1)
    del frames

    if store_id != 'all':
        full_df = full_df[full_df['store_id'] == store_id]

    last_week = end_train_week_x + prediction_horizon
    full_df = full_df[full_df['week'] <= last_week]

    # Lag pickles start with id, week and the target; keep only the feature
    # columns, and only the rows that survived the filters above.
    lagfeats = pd.read_pickle(lagfeats_path + str(prediction_horizon)).iloc[:, 3:]
    lagfeats = lagfeats[lagfeats.index.isin(full_df.index)]

    full_df = pd.concat([full_df, lagfeats], axis=1)
    del lagfeats

    enable_features = [name for name in full_df.columns if name not in remove_features]
    full_df = full_df[['id', 'week', target] + enable_features]

    #full_df = full_df[full_df['week'] >= start_train_week_x].reset_index(drop=True)

    return full_df, enable_features

**RUN MODEL**

In [None]:
# LightGBM hyper-parameters shared by every store/horizon model.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.03,
    'num_leaves': 2 ** 11 - 1,
    'min_data_in_leaf': 2 ** 12 - 1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 1400,
    'boost_from_average': False,
}

store_id_set_list = list(train_df['store_id'].unique())

# One model per (store, horizon). The inner loop variable is `horizon` so the
# `prediction_horizon` config value is not overwritten by running this cell.
for store_index, store_id in enumerate(store_id_set_list):
    for horizon in [3, 6, 9, 12]:
        save_name = '/content/drive/MyDrive/Capstone/Models/Weekly_lightgbm/weekly_lightgbm/' + str(store_id)+'-'+str(horizon)+'-'+'.csv'

        grid_df, enable_features = load_df(store_id, end_train_week_x, horizon)

        x_train = grid_df[(grid_df['week'] >= start_train_week_x) & (grid_df['week'] <= end_train_week_x)]
        y_train = x_train[target]
        print(y_train.head())
        # NOTE(review): the validation window is the last `horizon` weeks of the
        # training window itself, so the rmse logged below is an in-sample score.
        x_val = grid_df[(grid_df['week'] > (end_train_week_x - horizon)) & (grid_df['week'] <= end_train_week_x)]
        y_val = x_val[target]
        test = grid_df[grid_df['week'] > end_train_week_x]

        train_data = lgb.Dataset(x_train[enable_features],
                                 label=y_train)

        val_data = lgb.Dataset(x_val[enable_features],
                               label=y_val)

        # Free the large frames before training; the Datasets hold what they need.
        del grid_df, x_train, y_train
        gc.collect()

        # `verbose_eval` was removed from lgb.train in LightGBM 4.0; the
        # log_evaluation callback is its replacement. Keep the old keyword only
        # for releases that predate the callback.
        if hasattr(lgb, 'log_evaluation'):
            estimator = lgb.train(lgb_params, train_data, valid_sets=[val_data],
                                  callbacks=[lgb.log_evaluation(period=100)])
        else:
            estimator = lgb.train(lgb_params, train_data, valid_sets=[val_data], verbose_eval=100)

        val_pred = estimator.predict(x_val[enable_features])
        # scikit-learn's signature is (y_true, y_pred).
        val_score = np.sqrt(mean_squared_error(y_val, val_pred))
        print(f'Our val rmse score is {val_score}')

        y_pred = estimator.predict(test[enable_features])
        # `assign` builds a new frame instead of writing into a slice of grid_df.
        test = test.assign(**{target: y_pred})
        print(y_pred)

        predictions = test[['id', 'week', target]]
        # Show a preview only; the full frame is written to CSV below.
        print(predictions.head())
        # One row per id, one column per forecast week.
        predictions = pd.pivot(predictions, index = 'id', columns = 'week', values = target).reset_index()

        predictions.to_csv(save_name,index=False)

1386     7
1387    12
1388     6
1389     6
1390     7
Name: demand, dtype: int64
[100]	valid_0's rmse: 10.344
[200]	valid_0's rmse: 8.90512
[300]	valid_0's rmse: 8.76908
[400]	valid_0's rmse: 8.71102
[500]	valid_0's rmse: 8.64387
[600]	valid_0's rmse: 8.58386
[700]	valid_0's rmse: 8.49066
[800]	valid_0's rmse: 8.44113
[900]	valid_0's rmse: 8.37158
[1000]	valid_0's rmse: 8.32323
[1100]	valid_0's rmse: 8.26932
[1200]	valid_0's rmse: 8.19554
[1300]	valid_0's rmse: 8.1775
[1400]	valid_0's rmse: 8.13107
Our val rmse score is 8.131070016533705
[4.40507666 4.45816883 4.14997417 ... 1.76756604 1.55807197 1.67722264]
                                      id  week    demand
1650         FOODS_1_001_TX_2_evaluation   265  4.405077
1651         FOODS_1_001_TX_2_evaluation   266  4.458169
1652         FOODS_1_001_TX_2_evaluation   267  4.149974
4420         FOODS_1_002_TX_2_evaluation   265  1.429923
4421         FOODS_1_002_TX_2_evaluation   266  1.293326
...                                  ... 