# XGboost regressor with lag-1 feature (recursive forecasting)

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

In [47]:
train_merged = pd.read_csv('../data/train/train_merged.csv')
test_merged = pd.read_csv('../data/test_merged.csv')

## New feature: lag-1
Generally, in forecasting models, lagged values are an important feature. In this case I will define lagged value as the target at the last center/meal occurence. That means that the lag can be higher than a week since the center/meal combinations do not occure regularly over the 145 weeks in the train set.

In [48]:
train_merged['lag_1'] = train_merged.groupby(['center_id','meal_id'])['num_orders'].shift()

In [49]:
train_merged['lag_1'].isna().sum()

3597

We can see that we have about 3597 NaNs in the new lag-1 column which corresponds to the number of center/meal combinations in our train set. The target at the first occurence of each combination does not have a lagged value. Let's impute those NaNs with the mean target (over the train set) of the corresponding center/meal combination.

In [50]:
mean_num_orders_per_combination = train_merged.groupby(['center_id','meal_id'],as_index=False)['num_orders'].mean()
mean_num_orders_per_combination.rename(columns={'num_orders':'mean_num_orders'},inplace=True)
mean_num_orders_per_combination.tail()

Unnamed: 0,center_id,meal_id,mean_num_orders
3592,186,2707,206.924138
3593,186,2760,60.375887
3594,186,2826,271.391608
3595,186,2867,37.717172
3596,186,2956,32.4


In [54]:
train_merged = pd.merge(train_merged,mean_num_orders_per_combination, on=['center_id','meal_id'])
train_merged['lag_1'].fillna(train_merged['mean_num_orders'],inplace=True)

In [55]:
X = train_merged[['center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured',
       'center_type', 'category', 'cuisine', 'calendar_week','lag_1']]

In [56]:
y = train_merged['num_orders']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [58]:
categories = ['emailer_for_promotion','homepage_featured',
              'center_type','category', 'cuisine']

In [59]:
trans = ColumnTransformer([
    ('num categories', OneHotEncoder(drop='first',sparse=False), categories),],remainder='passthrough')

In [60]:
model_pipe = Pipeline([
    ('feature_engineering', trans), 
    ('model', xgb.XGBRegressor(max_depth = 10, eta=0.2))   
                    ])
model_pipe.fit(X_train, y_train)

Pipeline(steps=[('feature_engineering',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num categories',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['emailer_for_promotion',
                                                   'homepage_featured',
                                                   'center_type', 'category',
                                                   'cuisine'])])),
                ('model',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1,...gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='',
                              learning_rate=0.

In [61]:
y_train_pred = model_pipe.predict(X_train)
y_test_pred = model_pipe.predict(X_test)

In [67]:
len(y_train_pred[y_train_pred<0]), len(y_test_pred[y_test_pred<0])

(67, 31)

We can see that we have some negative prediction that have to be set to zero

In [68]:
y_train_pred[y_train_pred<0] = 0
y_test_pred[y_test_pred<0] = 0

In [69]:
print('train score:',np.sqrt(mean_squared_log_error(y_train, y_train_pred)))
print('test score:',np.sqrt(mean_squared_log_error(y_test, y_test_pred)))

train score: 0.473414510382005
test score: 0.5091725484435257


## Challenge: there are no lagged values in the test set (submission set)
Approach: recursive forecasting. Each week:
- fill lagged values
- predict target values
- save predictions as lagged values for next week

### Initialize lagged values for the test set (submission set) as last target value for each center/meal combination in the train set

In [71]:
last_num_orders = train_merged.groupby(['center_id','meal_id'],as_index=False)['num_orders'].last()
last_num_orders.rename(columns={'num_orders':'lag_1'},inplace=True)
last_num_orders.tail()

Unnamed: 0,center_id,meal_id,lag_1
3592,186,2707,175
3593,186,2760,96
3594,186,2826,162
3595,186,2867,28
3596,186,2956,122


In [72]:
final_test = pd.DataFrame()
last_num_orders = train_merged.groupby(['center_id','meal_id'],as_index=False)['num_orders'].last()
last_num_orders.rename(columns={'num_orders':'lag_1'},inplace=True)

for week in range(146,156):
    
    test_week = test_merged[test_merged['week']==week]
    test_week = pd.merge(test_week,last_num_orders,on=['center_id','meal_id'],how='left')
    
    X_week = test_week[['center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 
       'center_type', 'category', 'cuisine', 'calendar_week','lag_1']]
    
    y_week = model_pipe.predict(X_week)
    y_week[y_week<0] = 0
    test_week['num_orders'] = y_week
    
    
    last_num_orders = test_week[['center_id','meal_id','num_orders']]
    last_num_orders.rename(columns={'num_orders':'lag_1'},inplace=True)
    
    final_test = pd.concat([final_test,test_week])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

## Submission

In [73]:
sub_4 = final_test[['id','num_orders']]
sub_4

Unnamed: 0,id,num_orders
0,1028232,94.242218
1,1127204,53.437073
2,1212707,129.723343
3,1082698,58.305077
4,1400926,43.842861
...,...,...
3173,1250239,43.826385
3174,1039516,32.728634
3175,1158107,250.157654
3176,1444235,230.818848


## Submission score: 59.40