In [1]:
# Imports and useful definitions
import random
import numpy as np
import pandas as pd

from math import sqrt
from IPython.display import display
from sklearn.metrics import mean_squared_error

random.seed(156)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

exp_constant = 0.9

In [2]:
# Define the evaluation metric that we want to minimize
def rmse(predicted, actual):
    return sqrt(mean_squared_error(actual, predicted))

In [3]:
# Load training and validation data
x_train = pd.read_csv("temp_data/simple/x_train100.csv")
x_valid = pd.read_csv("temp_data/simple/x_valid100.csv")

In [4]:
# Demand data skewed to the right, use log to get closer to normal distribution
# np.log1p function handles cases where value == 0
x_train["log_demand"] = np.log1p(x_train.adj_demand)
x_valid["log_demand"] = np.log1p(x_valid.adj_demand)

In [5]:
# Compute global median
log_global_median = np.median(x_train["log_demand"])
display(str(log_global_median))
x_valid["log_med"] = log_global_median
display(str(rmse(x_valid["log_med"], x_valid["log_demand"])))

'1.38629436112'

'0.8646404207528307'

In [6]:
# Compute means of log-transformed demand (better than standard medians and means)
# Math explanation : https://www.kaggle.com/apapiu/grupo-bimbo-inventory-demand/mean-vs-medians-a-mathy-approach/discussion
logmeans_prod = (x_train.groupby(["product_id"]))["log_demand"].mean().to_dict()
logmeans_prod_store = (x_train.groupby(["product_id", "store_id"]))["log_demand"].mean().to_dict()
logmeans_prod_depot = (x_train.groupby(["product_id", "depot_id"]))["log_demand"].mean().to_dict()
logmeans_prod_chan = (x_train.groupby(["product_id", "channel_id"]))["log_demand"].mean().to_dict()
'''logmeans_prod_store_depot = (x_train.groupby(["product_id", "store_id", "depot_id"]))["log_demand"].mean().to_dict()
logmeans_prod_store_chan = (x_train.groupby(["product_id", "store_id", "channel_id"]))["log_demand"].mean().to_dict()'''
logmeans_prod_depot_store = (x_train.groupby(["product_id", "depot_id", "store_id"]))["log_demand"].mean().to_dict()
logmeans_prod_chan_store = (x_train.groupby(["product_id", "channel_id", "store_id"]))["log_demand"].mean().to_dict()
'''logmeans_prod_store_depot_chan = (x_train.groupby(["product_id", "store_id", "depot_id", 
                                                   "channel_id"]))["log_demand"].mean().to_dict()
logmeans_prod_store_chan_depot = (x_train.groupby(["product_id", "store_id", "channel_id", 
                                                   "depot_id"]))["log_demand"].mean().to_dict()
logmeans_prod_depot_store_chan = (x_train.groupby(["product_id", "depot_id", "store_id", 
                                                   "channel_id"]))["log_demand"].mean().to_dict()
logmeans_prod_depot_chan_store = (x_train.groupby(["product_id", "depot_id", "channel_id", 
                                                   "store_id"]))["log_demand"].mean().to_dict()
logmeans_prod_chan_depot_store = (x_train.groupby(["product_id", "channel_id", "depot_id", 
                                                   "store_id"]))["log_demand"].mean().to_dict()
logmeans_prod_chan_store_depot = (x_train.groupby(["product_id", "channel_id", "store_id", 
                                                   "depot_id"]))["log_demand"].mean().to_dict()'''

'logmeans_prod_store_depot_chan = (x_train.groupby(["product_id", "store_id", "depot_id", \n                                                   "channel_id"]))["log_demand"].mean().to_dict()\nlogmeans_prod_store_chan_depot = (x_train.groupby(["product_id", "store_id", "channel_id", \n                                                   "depot_id"]))["log_demand"].mean().to_dict()\nlogmeans_prod_depot_store_chan = (x_train.groupby(["product_id", "depot_id", "store_id", \n                                                   "channel_id"]))["log_demand"].mean().to_dict()\nlogmeans_prod_depot_chan_store = (x_train.groupby(["product_id", "depot_id", "channel_id", \n                                                   "store_id"]))["log_demand"].mean().to_dict()\nlogmeans_prod_chan_depot_store = (x_train.groupby(["product_id", "channel_id", "depot_id", \n                                                   "store_id"]))["log_demand"].mean().to_dict()\nlogmeans_prod_chan_store_depot = (x_train.groupby

In [None]:
'''def solution(key):    
    key = tuple(key)
    prod_id = key[0]
    channel_id = key[1]
    store_id = key[2]
    depot_id = key[3]
    if (prod_id, channel_id, store_id, depot_id) in logmeans_prod_chan_store_depot:
        val = logmeans_prod_chan_store_depot[prod_id, channel_id, store_id, depot_id]
    else:
        val = log_global_median        
    return val

x_valid["temp"] = x_valid[["product_id", "channel_id", "store_id", "depot_id"]].apply(lambda x:solution(x), axis = 1)
display(str(rmse(x_valid["temp"], x_valid["log_demand"])))'''

'def solution(key):    \n    key = tuple(key)\n    prod_id = key[0]\n    channel_id = key[1]\n    store_id = key[2]\n    depot_id = key[3]\n    if (prod_id, channel_id, store_id, depot_id) in logmeans_prod_chan_store_depot:\n        val = logmeans_prod_chan_store_depot[prod_id, channel_id, store_id, depot_id]\n    else:\n        val = log_global_median        \n    return val\n\nx_valid["temp"] = x_valid[["product_id", "channel_id", "store_id", "depot_id"]].apply(lambda x:solution(x), axis = 1)\ndisplay(str(rmse(x_valid["temp"], x_valid["log_demand"])))'

In [None]:
def solution(key):    
    key = tuple(key)
    product_id = key[0]
    channel_id = key[1]
    store_id = key[2]
    depot_id = key[3]
    if (product_id, depot_id, store_id) in logmeans_prod_depot_store:
        val = logmeans_prod_depot_store[product_id, depot_id, store_id]
    elif (product_id, store_id) in logmeans_prod_store:
        val = logmeans_prod_store[product_id, store_id]
    elif (product_id, channel_id, store_id) in logmeans_prod_chan_store:
        val = logmeans_prod_chan_store[product_id, channel_id, store_id]
    elif (product_id, depot_id) in logmeans_prod_depot:
        val = logmeans_prod_depot[product_id, depot_id]
    elif (product_id, channel_id) in logmeans_prod_chan:
        val = logmeans_prod_chan[product_id, channel_id]
    elif product_id in logmeans_prod:
        val = logmeans_prod[product_id]
    else:
        val = log_global_median
        
    return val

x_valid["log_guess"] = x_valid[["product_id", "channel_id", "store_id", "depot_id"]].apply(lambda x:solution(x), axis = 1)
display(str(rmse(x_valid["log_guess"], x_valid["log_demand"])))

# Use exponential to transform back from log form
x_valid["guess"] = np.expm1(x_valid["log_guess"]) * exp_constant

In [None]:
display(x_valid.head(10))