In [5]:
"""
Problem Statement
Your client is a meal delivery company which operates in multiple cities. They have various fulfillment centers in these cities for dispatching meal orders to their customers. 
The client wants you to help these centers with demand forecasting for upcoming weeks so that these centers will plan the stock of raw materials accordingly.
"""

"""
Imports required for implementation.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import gc
import catboost as cb
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.model_selection import train_test_split,KFold
%matplotlib inline
warnings.filterwarnings("ignore")

In [6]:
def show_all(df,nrow,ncol):
    with pd.option_context("display.max_rows",nrow,"display.max_columns",ncol):
        display(df)

In [12]:
"""
Loading all datasets provided
"""
train_path = "Datasets/train/"
test_path = "Datasets/test/"
sub_path = "Datasets/sample/"

fci_df = pd.read_csv(f"{train_path}/fulfilment_center_info.csv")
mi_df = pd.read_csv(f"{train_path}/meal_info.csv")
train_df = pd.read_csv(f"{train_path}/train.csv")
test_df = pd.read_csv(f"{test_path}/test.csv")
sub_df = pd.read_csv(f"{sub_path}/sample_submission.csv")

In [13]:
"""
EDA of all datasets
"""

"""
Fulfilment center info dataset
"""

fci_df.head()

Unnamed: 0,center_id,city_code,region_code,center_type,op_area
0,11,679,56,TYPE_A,3.7
1,13,590,56,TYPE_B,6.7
2,124,590,56,TYPE_C,4.0
3,66,648,34,TYPE_A,4.1
4,94,632,34,TYPE_C,3.6


In [14]:
print("Shape: ",fci_df.shape)

Shape:  (77, 5)


In [15]:
#Checking for null values
fci_df.isnull().sum()

center_id      0
city_code      0
region_code    0
center_type    0
op_area        0
dtype: int64

In [16]:
"""
Meal info dataset
"""

mi_df.head()

Unnamed: 0,meal_id,category,cuisine
0,1885,Beverages,Thai
1,1993,Beverages,Thai
2,2539,Beverages,Thai
3,1248,Beverages,Indian
4,2631,Beverages,Indian


In [17]:
print("Shape: ",mi_df.shape)

Shape:  (51, 3)


In [19]:
#Checking for null values
mi_df.isnull().sum()

meal_id     0
category    0
cuisine     0
dtype: int64

In [20]:
"""
Training dataset
"""

train_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,1379560,1,55,1885,136.83,152.29,0,0,177
1,1466964,1,55,1993,136.83,135.83,0,0,270
2,1346989,1,55,2539,134.86,135.86,0,0,189
3,1338232,1,55,2139,339.5,437.53,0,0,54
4,1448490,1,55,2631,243.5,242.5,0,0,40


In [21]:
print("Shape: ",train_df.shape)

Shape:  (456548, 9)


In [22]:
#Checking for null values
train_df.isnull().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64

In [23]:
"""
Testing dataset
"""

test_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured
0,1028232,146,55,1885,158.11,159.11,0,0
1,1127204,146,55,1993,160.11,159.11,0,0
2,1212707,146,55,2539,157.14,159.14,0,0
3,1082698,146,55,2631,162.02,162.02,0,0
4,1400926,146,55,1248,163.93,163.93,0,0


In [24]:
print("Shape: ",test_df.shape)

Shape:  (32573, 8)


In [25]:
#Checking for null values
test_df.isnull().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
dtype: int64

In [26]:
"""
Combining the fulfilment center dataset and meal info dataset into one dataset
"""

train_df = pd.merge(train_df , fci_df , on="center_id")
test_df = pd.merge(test_df , fci_df , on="center_id")

train_df = pd.merge(train_df , mi_df , on="meal_id")
test_df = pd.merge(test_df , mi_df , on="meal_id")

In [28]:
"""
New train dataset
"""
train_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1018704,2,55,1885,135.83,152.29,0,0,323,647,56,TYPE_C,2.0,Beverages,Thai
2,1196273,3,55,1885,132.92,133.92,0,0,96,647,56,TYPE_C,2.0,Beverages,Thai
3,1116527,4,55,1885,135.86,134.86,0,0,163,647,56,TYPE_C,2.0,Beverages,Thai
4,1343872,5,55,1885,146.5,147.5,0,0,215,647,56,TYPE_C,2.0,Beverages,Thai


In [29]:
"""
Labelling train and test data
"""

train_df["train_or_test"] = "train"
test_df["train_or_test"] = "test"

In [30]:
"""
Finding natural logarithmic value of the num_orders for easier calculation to the model
"""
train_df["num_orders"] = np.log1p(train_df["num_orders"])

In [31]:
"""
Combining train and test dataset into one dataset
"""
all_df = train_df.append(test_df).reset_index(drop=True)[train_df.columns]

In [32]:
"""
Sorting the values on the basis of centers,meals and weeks
"""
all_df = all_df.sort_values(["center_id" , "meal_id" , "week"]).reset_index(drop = True)

"""
Logarithmic values of checkout and base_price for the model to work efficiently
"""
all_df["checkout_price"] = np.log1p(all_df["checkout_price"])
all_df["base_price"] = np.log1p(all_df["base_price"])
all_df["discount_on_base"] = (all_df["base_price"] - all_df["checkout_price"])/all_df["base_price"]

In [33]:
"""
Displaying all dataset
"""
all_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine,train_or_test,discount_on_base
0,1436842,1,10,1062,5.206147,5.206147,0,0,6.763885,590,56,TYPE_B,6.3,Beverages,Italian,train,0.0
1,1205013,2,10,1062,5.21689,5.211451,0,0,6.663133,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.001044
2,1447751,3,10,1062,5.2223,5.211451,0,0,6.747587,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.002082
3,1014968,4,10,1062,5.211451,5.21689,0,0,7.092574,590,56,TYPE_B,6.3,Beverages,Italian,train,0.001043
4,1003563,5,10,1062,5.217053,5.206147,0,0,6.865891,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.002095


In [34]:
all_df["neg_discount"] = (all_df["discount_on_base"] < 0).astype(int)
"""
Calculating difference in costs in consecutive orders
"""
all_df["price_last_curr_diff"] = (all_df["checkout_price"].shift(1) - all_df["checkout_price"]).fillna(1)/all_df["checkout_price"].shift(1).fillna(1)

In [42]:
"""
Grouping by center_id and meal_id and weeks
"""
for _, r in all_df.groupby(['center_id', 'meal_id'])['week'].first().reset_index().iterrows():
    all_df.loc[(all_df['center_id']==r['center_id']) & (all_df['meal_id']==r['meal_id']) & (all_df['week']==r['week']), 'price_last_curr_diff']=all_df[(all_df['center_id']==r['center_id']) & (all_df['meal_id']==r['meal_id']) & (all_df['week']!=r['week'])]['price_last_curr_diff'].mean()

In [43]:
all_df["price_last_curr_diff"] = all_df["price_last_curr_diff"].fillna(0)
all_df["price_increase"] = (all_df["price_last_curr_diff"] < 0).astype(int)

In [44]:
"""
Printing all values after all operations required on the data
"""
all_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine,train_or_test,discount_on_base,neg_discount,price_last_curr_diff,price_increase
0,1436842,1,10,1062,5.206147,5.206147,0,0,6.763885,590,56,TYPE_B,6.3,Beverages,Italian,train,0.0,0,8.6e-05,0
1,1205013,2,10,1062,5.21689,5.211451,0,0,6.663133,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.001044,1,-0.002064,1
2,1447751,3,10,1062,5.2223,5.211451,0,0,6.747587,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.002082,1,-0.001037,1
3,1014968,4,10,1062,5.211451,5.21689,0,0,7.092574,590,56,TYPE_B,6.3,Beverages,Italian,train,0.001043,0,0.002077,0
4,1003563,5,10,1062,5.217053,5.206147,0,0,6.865891,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.002095,1,-0.001075,1


In [48]:
"""
Forecasting method used:
catBooster regressor method
Uses gradient boosting on values for prediction
"""

"""
Monthwise aggregation of the sales values
"""

def create_sales_agg_monthwise_features(df , gpby_cols , target_col , agg_funcs):
    gpby = df.groupby(gpby_cols)
    newdf = df[gpby_cols].drop_duplicates().reset_index(drop = True)
    for agg_name , agg_func in agg_funcs.items():
        aggdf = gpby[target_col].agg(agg_func).reset_index()
        aggdf.rename(columns = {target_col:target_col+"_"+agg_name} , inplace = True)
        newdf = newdf.merge(aggdf , on=gpby_cols , how = "left")
    
    return newdf

"""
Creating features from previous sales values
"""


# Creating sales lag features

def create_sales_lag_feats(df , gpby_cols , target_col , lags):
    gpby = df.groupby(gpby_cols)
    for lag in lags:
        df["_".join([target_col , "lag" ,str(lag)])] = gpby[target_col].shift(lag).values + np.random.normal(scale = 1.6 , size = len(df))
    
    return df

# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df


# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                            shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).median().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df

def one_hot_encoder(df , ohe_cols):
    print("Creating OHE features\nOld df shape: {}".format(df.shape))
    df = pd.get_dummies(df , columns = ohe_cols)
    print("New df shape : {}".format(df.shape))

    return df

In [49]:
"""
Creating num_order lag and ewm
"""

all_df = create_sales_lag_feats( all_df , gpby_cols = ["center_id" , "meal_id"] , target_col = "num_orders" , lags = [10,11,12])

all_df = create_sales_ewm_feats( all_df , gpby_cols = ["center_id" , "meal_id"] , target_col = "num_orders" , alpha = [0.5] , shift = [10,11,12,13,14,15])

In [50]:
"""
Shape of new all df
"""
all_df.shape

(489121, 29)

In [51]:
"""
Displaying the dataset
"""
show_all (all_df , 5 ,None)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,category,cuisine,train_or_test,discount_on_base,neg_discount,price_last_curr_diff,price_increase,num_orders_lag_10,num_orders_lag_11,num_orders_lag_12,num_orders_lag_10_ewm_0.5,num_orders_lag_11_ewm_0.5,num_orders_lag_12_ewm_0.5,num_orders_lag_13_ewm_0.5,num_orders_lag_14_ewm_0.5,num_orders_lag_15_ewm_0.5
0,1436842,1,10,1062,5.206147,5.206147,0,0,6.763885,590,56,TYPE_B,6.3,Beverages,Italian,train,0.000000,0,0.000086,0,,,,,,,,,
1,1205013,2,10,1062,5.216890,5.211451,0,0,6.663133,590,56,TYPE_B,6.3,Beverages,Italian,train,-0.001044,1,-0.002064,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489119,1429037,115,186,2956,6.366522,6.369952,0,0,2.772589,649,34,TYPE_A,3.4,Fish,Continental,train,0.000539,0,0.000539,0,1.242330,3.456474,4.548076,3.097631,3.556204,3.398836,3.430377,3.564916,3.392163
489120,1285049,116,186,2956,5.680275,6.368239,0,0,4.812184,649,34,TYPE_A,3.4,Fish,Continental,train,0.108030,0,0.107790,0,2.276765,2.876333,5.289352,3.405601,3.097631,3.556204,3.398836,3.430377,3.564916


In [52]:
"""
New Train and test dataset
"""
train = all_df.loc[all_df["train_or_test"] == "train" , :]
test = all_df.loc[all_df["train_or_test"] == "test" , :]

In [53]:
"""
Train shape and test shape
"""
print("Train shape : {}\nTest shape : {}".format(train.shape , test.shape))

Train shape : (456548, 29)
Test shape : (32573, 29)


In [54]:
"""
Checking for null values
"""
train.isnull().sum()

id                               0
week                             0
center_id                        0
meal_id                          0
checkout_price                   0
base_price                       0
emailer_for_promotion            0
homepage_featured                0
num_orders                       0
city_code                        0
region_code                      0
center_type                      0
op_area                          0
category                         0
cuisine                          0
train_or_test                    0
discount_on_base                 0
neg_discount                     0
price_last_curr_diff             0
price_increase                   0
num_orders_lag_10            35843
num_orders_lag_11            39412
num_orders_lag_12            42980
num_orders_lag_10_ewm_0.5       10
num_orders_lag_11_ewm_0.5       11
num_orders_lag_12_ewm_0.5       12
num_orders_lag_13_ewm_0.5       13
num_orders_lag_14_ewm_0.5       14
num_orders_lag_15_ew

In [56]:
"""
Implementing catboost regressor
"""

avoid = ["id" , "num_orders" , "train_or_test" , "checkout_price" , "base_price" , "city_code" , "region_code" , "center_type" , ""]

features = [col for col in test.columns if col not in avoid]
features

['week',
 'center_id',
 'meal_id',
 'emailer_for_promotion',
 'homepage_featured',
 'op_area',
 'category',
 'cuisine',
 'discount_on_base',
 'neg_discount',
 'price_last_curr_diff',
 'price_increase',
 'num_orders_lag_10',
 'num_orders_lag_11',
 'num_orders_lag_12',
 'num_orders_lag_10_ewm_0.5',
 'num_orders_lag_11_ewm_0.5',
 'num_orders_lag_12_ewm_0.5',
 'num_orders_lag_13_ewm_0.5',
 'num_orders_lag_14_ewm_0.5',
 'num_orders_lag_15_ewm_0.5']

In [62]:
"""
Features index to be included while fitting in the model
"""
cat_index = [1,2,3,4,6,7,9,11]

cat_regressor = cb.CatBoostRegressor(iterations = 200,
                                     learning_rate = 0.5,
                                     depth = 5,
                                     l2_leaf_reg = 10,
                                     loss_function = "RMSE" ,
                                     random_seed = 2020)

cat_regressor.fit( X = train[features],
                   y = train["num_orders"] ,
                   cat_features = cat_index ,
                   verbose = True)

0:	learn: 0.8799703	total: 412ms	remaining: 1m 21s
1:	learn: 0.7333530	total: 876ms	remaining: 1m 26s
2:	learn: 0.6634167	total: 1.57s	remaining: 1m 43s
3:	learn: 0.6330906	total: 2.31s	remaining: 1m 53s
4:	learn: 0.6158935	total: 2.69s	remaining: 1m 44s
5:	learn: 0.6052067	total: 3.17s	remaining: 1m 42s
6:	learn: 0.5986738	total: 3.57s	remaining: 1m 38s
7:	learn: 0.5936541	total: 3.9s	remaining: 1m 33s
8:	learn: 0.5810082	total: 4.29s	remaining: 1m 31s
9:	learn: 0.5775897	total: 4.68s	remaining: 1m 29s
10:	learn: 0.5747898	total: 5.16s	remaining: 1m 28s
11:	learn: 0.5725972	total: 5.49s	remaining: 1m 25s
12:	learn: 0.5696887	total: 5.87s	remaining: 1m 24s
13:	learn: 0.5675570	total: 6.27s	remaining: 1m 23s
14:	learn: 0.5642399	total: 6.69s	remaining: 1m 22s
15:	learn: 0.5612122	total: 7.08s	remaining: 1m 21s
16:	learn: 0.5595167	total: 7.37s	remaining: 1m 19s
17:	learn: 0.5574564	total: 7.69s	remaining: 1m 17s
18:	learn: 0.5551582	total: 8.1s	remaining: 1m 17s
19:	learn: 0.5529260	tot

<catboost.core.CatBoostRegressor at 0x1c1a9b84e48>

In [63]:
"""
Prediction of the food demand
"""
pred = cat_regressor.predict(test[features])
pred

array([6.89979613, 6.86790046, 6.75314512, ..., 2.9874359 , 2.96157175,
       3.32487581])

In [60]:
pred = (np.exp(pred) - 1)

In [61]:
"""
Making the submissions csv
"""
sub_df = pd.DataFrame( columns = ["id" , "num_orders"])
sub_df["id"] = test["id"]
sub_df["num_orders"] = pred

"""
Converting the dataset into csv
"""
sub_df.to_csv(f"{sub_path}/submissions.csv" , index = False)