In [1]:
from  datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
#LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import numpy as np, pandas as pd
import matplotlib.pyplot as plot 
import sklearn
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def load_df(prev_last, pred_period, is_train=True, stores='CA_1'):
    '''Load the M5 CSV files and build a long-format frame for one store.

    The wide sales table (one ``d_<n>`` column per day) is melted to one row
    per (id, day), joined with the calendar on ``d`` and with the prices on
    (``store_id``, ``item_id``, ``wm_yr_wk``), and then restricted to the
    requested store.

    Parameters
    ----------
    prev_last : int
        Number of the last ``d_<n>`` column already present in the sales file.
    pred_period : int
        How many future day columns to append when ``is_train`` is False.
    is_train : bool
        When False, ``d_<prev_last+1>`` .. ``d_<prev_last+pred_period>`` are
        added as all-NaN columns so they can be filled with predictions later.
    stores : str
        ``store_id`` value to keep.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the categorical columns label-encoded and
        ``wm_yr_wk``, ``weekday`` and ``store_id`` removed.
    '''
    cal_cat_cols = ['weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    sales_cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    calendars = pd.read_csv('/Users/hshan/Downloads/M5/calendar.csv')
    sales = pd.read_csv('/Users/hshan/Downloads/M5/sales_train_validation.csv')
    prices = pd.read_csv('/Users/hshan/Downloads/M5/sell_prices.csv')

    if not is_train:
        # Float NaN placeholders keep the melted 'sales' column numeric; an
        # empty pd.Series() can yield object dtype on recent pandas versions.
        future_days = range(prev_last + 1, prev_last + pred_period + 1)
        sales = sales.assign(**{f'd_{i}': np.nan for i in future_days})

    ind_var = ['id'] + sales_cat_cols
    val_var = [col for col in sales.columns if col.startswith('d_')]
    df = pd.melt(sales, id_vars=ind_var, value_vars=val_var, var_name='d', value_name='sales')
    df = df.merge(calendars, on='d', copy=False)
    df = df.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)

    # Filter on the `stores` argument (not on a module-level variable) and
    # take a copy so the column assignments below do not write into a view.
    df = df[df['store_id'] == stores].copy()

    # NOTE: the encoder is re-fit on the values present in this call's frame,
    # so integer codes are only comparable between calls that see the same
    # set of values in each column.
    label_encoder = LabelEncoder()
    for col in cal_cat_cols + sales_cat_cols:
        df[col] = df[col].fillna('').astype('category')
        df[col] = label_encoder.fit_transform(df[col])

    unused_cols = ['wm_yr_wk', 'weekday', 'store_id']
    df.drop(unused_cols, inplace=True, axis=1)

    return df

def lag_features(df):
    '''Add per-id lagged sales and rolling means of those lags, in place.

    For every lag in (1, 7, 28) a ``lag_<lag>`` column holds ``sales`` shifted
    by that many rows within each ``id``. For every window in (1, 7, 28) and
    every lag, ``mean_<lag>_<window>`` holds the rolling mean of ``lag_<lag>``
    over that many rows, again within each ``id``.

    The frame passed in is modified and also returned. The original note on
    this function: the max lag should not exceed 57 in this case.
    '''
    periods = [1, 7, 28]

    # Lag columns first, so they precede every rolling-mean column.
    for period in periods:
        df[f'lag_{period}'] = df.groupby('id')['sales'].shift(period)

    # Outer loop over window sizes, inner loop over lags (column order matters).
    for span in periods:
        for period in periods:
            source_col = f'lag_{period}'
            per_id = df.groupby('id')[source_col]
            df[f'mean_{period}_{span}'] = per_id.transform(
                lambda series: series.rolling(span).mean()
            )

    return df

def submission(result_df, start_date=None):
    '''Reshape a filled prediction frame into the F1..F28 submission layout.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Frame produced by the prediction loop. Must contain the columns
        ``id``, ``d``, ``date`` and ``sales`` and cover exactly 56 distinct
        days on or after the first forecast day (earlier rows are ignored).
    start_date : datetime, optional
        First forecast day. When omitted, the module-level ``first_day`` is
        used, which is what this function relied on before the parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        One row per id for the first 28 days, followed by one row per id for
        the next 28 days with ``validation`` replaced by ``evaluation`` in the
        id. Columns are ``id`` and ``F1`` .. ``F28``.
    '''
    start = first_day if start_date is None else start_date
    split_day = start + timedelta(days=28)

    # Parse the dates once and work from the argument, not from a global frame.
    dates = pd.to_datetime(result_df['date'])
    keep_cols = ['id', 'd', 'sales']
    val_df = result_df.loc[(dates >= start) & (dates < split_day), keep_cols]
    eval_df = result_df.loc[dates >= split_day, keep_cols]

    # Day labels in order of appearance; unstack() sorts columns, so they are
    # re-selected in this order afterwards.
    col_v = list(val_df['d'].unique())
    col_e = list(eval_df['d'].unique())

    f_cols = [f'F{i}' for i in range(1, 29)]

    val_df = val_df.set_index(["id", "d"]).unstack()['sales'][col_v].reset_index()
    eval_df = eval_df.set_index(["id", "d"]).unstack()['sales'][col_e].reset_index()

    val_df.columns = ['id'] + f_cols
    eval_df.columns = ['id'] + f_cols

    # Vectorised rename instead of a per-row chained assignment.
    eval_df['id'] = eval_df['id'].str.replace('validation', 'evaluation', regex=False)

    # Concatenate once, outside any loop, so the result exists even when the
    # evaluation half has no rows.
    final_sub = pd.concat([val_df, eval_df])

    return final_sub

In [20]:
# Horizon and day-number boundaries (day numbers refer to the d_<n> columns).
seed = 1231
pred_period = 56
train_start, train_last = 1, 1913
test_start = train_last + 1
test_last = train_last + pred_period
pred_start = test_last + 1
pred_last = test_last + pred_period

# Columns excluded from the model's feature list.
removed_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']

# Store identifiers processed one at a time.
store_sub = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]

# Calendar-derived categorical columns followed by item/store hierarchy columns.
categorical_cols = [
    'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
]

In [13]:
# LightGBM training parameters passed to lgb.train for every store.
param = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2 ** 11 - 1,
    min_data_in_leaf=2 ** 12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    num_iterations=2000,
    boost_from_average=False,
    verbose=-1,
)

In [None]:
# Train one LightGBM model per store, roll the forecast forward one day at a
# time, and gather every store's submission rows into a single frame.
#
# NOTE: keep the names `store`, `df` and `first_day` as they are; helper
# functions in this notebook have relied on them as module-level variables.

# First calendar day to forecast.
first_day = datetime(2016, 4, 25)

# One submission frame per store; concatenated once after the loop so that
# earlier stores are not overwritten by later ones.
store_submissions = []

for store in store_sub:
    # ---- training ----
    df = load_df(train_last, pred_period, is_train=True, stores=store)
    df = lag_features(df)

    # Rows without a full lag history contain NaN and are dropped.
    df.dropna(inplace=True)
    y_target = df['sales']

    df_cols = list(df.columns)
    x_features = [feature for feature in df_cols if feature not in removed_cols]

    lgbm_train_data = lgb.Dataset(df[x_features], label=y_target)
    model = lgb.train(param, train_set=lgbm_train_data)

    # ---- recursive day-by-day prediction ----
    df = load_df(train_last, pred_period, is_train=False, stores=store)
    # Parse the date column once per store instead of several times per day.
    dates = pd.to_datetime(df['date'])

    for delta in range(pred_period):
        predict_day = first_day + timedelta(days=delta)

        # Only the trailing window is needed to rebuild the lag features.
        in_window = (dates <= predict_day) & (dates >= predict_day - timedelta(days=57))
        # Explicit copy: lag_features adds columns to the frame it receives.
        predict_df = lag_features(df.loc[in_window].copy())

        is_target_day = pd.to_datetime(predict_df['date']) == predict_day
        result = model.predict(predict_df.loc[is_target_day, x_features])

        # Write the forecast back so later days can use it as lag input.
        df.loc[dates == predict_day, 'sales'] = result
        del predict_df

    sub_df = submission(df)
    store_submissions.append(sub_df)

sub = pd.concat(store_submissions, ignore_index=True)

sub.shape

In [21]:
# Preview the first rows of the assembled submission frame.
sub.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.861528,0.829079,0.885889,0.869121,1.132691,0.983988,1.136245,0.943548,1.014477,...,1.196196,1.319166,1.378998,0.9987,0.884017,0.868214,0.870822,1.143662,1.310316,1.13569
1,FOODS_1_002_CA_1_validation,0.413219,0.454041,0.4094,0.45465,0.584919,0.488027,0.503477,0.4822,0.562531,...,0.602412,0.742162,0.606611,0.539938,0.485999,0.512591,0.524009,0.60642,0.612464,0.520054
2,FOODS_1_003_CA_1_validation,0.815611,0.769896,0.769102,0.793459,0.839925,0.797297,0.831561,0.691332,0.733296,...,0.780199,0.961058,0.835676,0.676955,0.634748,0.65445,0.736228,0.787281,0.903966,0.806371
3,FOODS_1_004_CA_1_validation,0.018121,0.904354,0.975617,1.34774,1.87397,1.82436,2.09452,1.270402,1.477433,...,1.686644,1.813961,1.764954,1.258206,1.10385,1.243125,1.30674,1.486079,1.756218,1.818134
4,FOODS_1_005_CA_1_validation,1.066217,1.064439,1.263986,1.253811,1.473062,1.385474,1.517981,1.405904,1.334819,...,1.303113,1.331599,1.250319,1.040171,1.06708,1.039098,1.151356,1.26625,1.357884,1.067834
