In [None]:
from  datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
#LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import numpy as np, pandas as pd
import matplotlib.pyplot as plot 
import sklearn
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

import joblib

In [None]:
def load_df(prev_last, pred_period, is_train=True, data_dir='/Users/hshan/Downloads/M5'):
    '''Load the M5 CSV files and build one long-format frame (one row per id and day).

    Parameters
    ----------
    prev_last : int
        Number of the last ``d_<n>`` day column present in the sales file.
        Only used when ``is_train`` is False.
    pred_period : int
        Number of future day columns (``d_<prev_last+1>`` ...) to append as
        NaN placeholders when ``is_train`` is False.
    is_train : bool, default True
        When False, placeholder day columns are appended so the melted frame
        also contains rows for the days to be predicted.
    data_dir : str
        Directory holding ``calendar.csv``, ``sales_train_validation.csv`` and
        ``sell_prices.csv``. Defaults to the path previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Melted sales joined with the calendar (on ``d``) and the prices (on
        ``store_id``, ``item_id``, ``wm_yr_wk``). The remaining calendar/sales
        categorical columns are label-encoded to integers; ``wm_yr_wk``,
        ``weekday`` and ``store_id`` are removed.
    '''
    cal_cat_cols = ['weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    sales_cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    calendars = pd.read_csv(f'{data_dir}/calendar.csv')
    sales = pd.read_csv(f'{data_dir}/sales_train_validation.csv')
    prices = pd.read_csv(f'{data_dir}/sell_prices.csv')

    if not is_train:
        future_days = [f'd_{i}' for i in range(prev_last + 1, prev_last + pred_period + 1)]
        # Use float NaN placeholders: an empty `pd.Series()` is object-typed on
        # current pandas, which would turn the melted 'sales' column into
        # object dtype and break the numeric rolling means computed later.
        # All columns are added in a single concat to avoid fragmenting the
        # very wide sales frame; any pre-existing column of the same name is
        # replaced, as the per-column assignment did.
        placeholders = pd.DataFrame(np.nan, index=sales.index, columns=future_days)
        sales = pd.concat(
            [sales.drop(columns=future_days, errors='ignore'), placeholders],
            axis=1,
        )

    ind_var = ['id'] + sales_cat_cols
    val_var = [col for col in sales.columns if col.startswith('d_')]
    df = pd.melt(sales, id_vars=ind_var, value_vars=val_var, var_name='d', value_name='sales')
    df = df.merge(calendars, on='d', copy=False)
    # Default (inner) join: rows without a matching price record are dropped.
    df = df.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], copy=False)

    # Join keys / redundant columns are removed before encoding so no work is
    # spent label-encoding columns that are thrown away anyway.
    # NOTE(review): 'store_id' is dropped here although the notebook's
    # `categorical_cols` lists it -- confirm it is meant to be excluded.
    unused_cols = ['wm_yr_wk', 'weekday', 'store_id']
    df.drop(unused_cols, inplace=True, axis=1)

    cat_cols = [col for col in cal_cat_cols + sales_cat_cols if col not in unused_cols]
    label_encoder = LabelEncoder()
    for col in cat_cols:
        # Missing values become '' so they get a code of their own.
        df[col] = df[col].fillna('').astype('category')
        df[col] = label_encoder.fit_transform(df[col])

    return df

def lag_features(df):
    '''Add per-id lagged sales and rolling means of those lags to ``df``.

    The columns are written into ``df`` itself, and that same object is
    returned. For every offset in (1, 7, 28) a ``lag_<offset>`` column holds
    ``sales`` shifted by that many rows within each ``id``. For every
    (window, offset) pair a ``mean_<offset>_<window>`` column holds the
    rolling mean of ``lag_<offset>`` over ``window`` rows within each ``id``.

    Rows are used in the order they appear (nothing is sorted here), and a
    value is NaN until ``offset + window - 1`` earlier rows exist for the id.
    The original note for callers: max lag should not be exceeding 57 in this
    case.
    '''
    offsets = [1, 7, 28]

    def _per_id(column):
        # Group a single column by series id.
        return df[['id', column]].groupby('id')[column]

    # Plain lags of the target.
    for offset in offsets:
        df[f'lag_{offset}'] = _per_id('sales').shift(offset)

    # Rolling means; window is the outer loop so the column order is
    # mean_1_1, mean_7_1, mean_28_1, mean_1_7, ...
    for span in offsets:
        for offset in offsets:
            source = f'lag_{offset}'
            df[f'mean_{offset}_{span}'] = _per_id(source).transform(
                lambda series: series.rolling(span).mean()
            )

    return df

def submission(result_df, start_day=None, horizon=28):
    '''Reshape long-format predictions into the wide submission layout.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Frame filled in by the prediction loop. Must contain the columns
        ``id``, ``d``, ``date`` and ``sales``.
    start_day : datetime, optional
        First predicted day. When omitted, the notebook-level ``first_day``
        variable is used.
    horizon : int, default 28
        Number of days per submission block (columns ``F1`` .. ``F<horizon>``).

    Returns
    -------
    pandas.DataFrame
        One row per id with columns ``id, F1..F<horizon>``. The rows for
        ``[start_day, start_day + horizon)`` come first. They are followed by
        the rows for all later days, whose ids have ``'validation'`` replaced
        by ``'evaluation'``.

    Raises
    ------
    ValueError
        If either block does not contain exactly ``horizon`` distinct days.
    '''
    if start_day is None:
        # Fall back to the global set in the prediction cell.
        start_day = first_day
    split_day = start_day + timedelta(days=horizon)

    # Parse the dates of the frame that was passed in, once.
    dates = pd.to_datetime(result_df['date'])
    f_cols = [f'F{i}' for i in range(1, horizon + 1)]

    def _to_wide(mask):
        # Pivot the selected rows to one row per id, days in order of appearance.
        part = result_df.loc[mask, ['id', 'd', 'sales']]
        day_order = list(part['d'].unique())
        wide = part.pivot(index='id', columns='d', values='sales')[day_order].reset_index()
        wide.columns = ['id'] + f_cols
        return wide

    val_df = _to_wide((dates >= start_day) & (dates < split_day))
    eval_df = _to_wide(dates >= split_day)

    # Vectorised rename instead of element-wise chained assignment.
    eval_df['id'] = eval_df['id'].str.replace('validation', 'evaluation', regex=False)

    return pd.concat([val_df, eval_df])


In [None]:
# Day-number bookkeeping. Days are numbered like the `d_<n>` sales columns.
pred_period = 56                          # days forecast in one run
train_start, train_last = 1, 1913         # first / last day number with known sales
test_start = train_last + 1
test_last = train_last + pred_period
pred_start = test_last + 1
pred_last = test_last + pred_period
seed = 1231

# Columns that must never be fed to the model as features.
removed_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']

# Calendar categoricals followed by item/store hierarchy categoricals.
categorical_cols = [
    'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
]

In [None]:
# Build the training frame: long-format sales plus lag / rolling-mean features.
df = lag_features(load_df(train_last, pred_period, is_train=True))

# Drop every row containing a NaN. This includes the leading rows of each
# series, whose lag / rolling features are still undefined.
df.dropna(inplace=True)
y_target = df['sales']

# Model inputs are all remaining columns except identifiers and the target.
df_cols = list(df.columns)
x_features = [feature for feature in df_cols if feature not in removed_cols]

In [None]:
# LightGBM training parameters.
# "metric" used to be listed twice in this literal; a duplicated key is
# silently overwritten by the later entry, so it is now listed once.
# NOTE(review): the configuration cell defines `seed`, but it is not passed
# here -- confirm whether a "seed" entry was intended for the row subsampling.
param = {
    "objective": "poisson",
    "metric": "rmse",
    'boosting_type': 'gbdt',
    "force_row_wise": True,
    "learning_rate": 0.075,
    #"sub_feature" : 0.8,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 1.5,
    "lambda_l1": 0.5,
    "nthread": 5,
    'verbosity': -1,
    'num_iterations': 2000,
    'num_leaves': 200,
    "min_data_in_leaf": 200,
}



In [None]:
# Fit on the whole training frame. No validation set, early stopping or
# explicit categorical_feature list is passed, so training is driven solely
# by the settings in `param`.
lgbm_train_data = lgb.Dataset(data=df[x_features], label=df['sales'])
model = lgb.train(params=param, train_set=lgbm_train_data)

In [None]:
# Persist the fitted booster so prediction can be re-run without retraining.
filename = '/Users/hshan/Kaggle/model.sav'
joblib.dump(value=model, filename=filename)

In [None]:
# To reuse a previously saved model instead of the one trained above:
# model = joblib.load('/Users/hshan/Kaggle/model_no_val.sav')

# Reload the data with NaN placeholder columns for the days to be predicted.
# This rebinds `df`, replacing the training frame.
df = load_df(prev_last=train_last, pred_period=pred_period, is_train=False)

In [None]:
# Recursive day-by-day forecast: each day's prediction is written back into
# `df['sales']` so it feeds the lag features of the following days.
first_day = datetime(2016, 4, 25)  # presumably the calendar date of d_1914 -- verify

# The date column never changes inside the loop, so parse it once instead of
# re-parsing the whole frame several times per predicted day.
df_dates = pd.to_datetime(df['date'])
lookback = timedelta(days=57)  # history handed to lag_features for each day

for delta in range(pred_period):
    predict_day = first_day + timedelta(days=delta)
    in_window = (df_dates <= predict_day) & (df_dates >= predict_day - lookback)
    is_today = df_dates == predict_day

    # lag_features adds its columns in place. Give it an explicit copy so it
    # never writes into a slice of `df`.
    predict_df = lag_features(df.loc[in_window].copy())

    # lag_features only adds columns, so the window rows keep their order and
    # a positional mask selects exactly the rows of the day being predicted.
    today_features = predict_df.loc[is_today[in_window].to_numpy(), x_features]
    df.loc[is_today, 'sales'] = model.predict(today_features)

    del predict_df, today_features
    print(predict_day)

In [None]:
# Reshape the filled-in predictions to the wide submission layout and write it out.
final_submission = submission(df)
final_submission.to_csv('/Users/hshan/Downloads/M5/model_sub.csv', index=False)