In [1]:
# Imports and useful definitions
import random
import numpy as np
import pandas as pd

from IPython.display import display

# Seed Python's built-in RNG so any later use of `random` is repeatable
random.seed(156)

# Render floats with exactly three decimals whenever pandas displays them
pd.set_option("display.float_format", "{:.3f}".format)

# Multiplier applied to the predictions once they are converted back from log space
# NOTE(review): the rationale for the value 0.9 is not visible here - presumably tuned; confirm
exp_constant = 0.9

In [2]:
# Load training and test data
# Training set: narrow dtypes are requested per column to keep the frame small in memory.
# The dtype keys have to match the CSV header exactly - to our knowledge pandas does not
# complain about a key that matches no column, it just falls back to the default dtype.
# The customer column is therefore spelled 'Cliente_ID' (underscore), like the other ids.
df_train = pd.read_csv("raw_data/train.csv",
                       dtype = {'Semana' : 'int8', 'Agencia_ID' :'int16', 'Canal_ID' : 'int8', 'Ruta_SAK' : 'int16',
                                'Cliente_ID' : 'int32', 'Producto_ID':'int32', 'Venta_uni_hoy':'int16', 'Venta_hoy': 'float32', 
                                'Dev_uni_proxima':'int32', 'Dev_proxima':'float32', 'Demanda_uni_equil':'int16'})
# Positional rename to English column names: the list order must follow the CSV column order
df_train.columns = ["week", "depot_id", "channel_id", "route_id", "store_id", "product_id", "sales_nb", "sales_pesos", 
                    "returns_nb", "returns_pesos", "adj_demand"]

# Test set: same compact dtypes as the training load, plus the submission row id.
# The customer column key is 'Cliente_ID' (underscore) so that the int32 request
# actually applies to it; a key that matches no CSV column has no effect.
df_test = pd.read_csv("raw_data/test.csv",
                       dtype = {'id' :'int32', 'Semana' : 'int8', 'Agencia_ID' :'int16', 'Canal_ID' : 'int8', 
                                'Ruta_SAK' : 'int16', 'Cliente_ID' : 'int32', 'Producto_ID':'int32'})
# Positional rename to English column names: the list order must follow the CSV column order
df_test.columns = ["id", "week", "depot_id", "channel_id", "route_id", "store_id", "product_id"]

In [3]:
# The demand distribution is right-skewed, so work in log space to get closer to a normal shape.
# log1p(x) = log(1 + x): rows with zero demand map to 0 instead of -inf.
df_train["log_demand"] = np.log1p(df_train["adj_demand"])

In [4]:
# Median of log-demand over the whole training set; solution() uses it as the last-resort fallback
log_global_median = np.median(df_train.log_demand)
# Show the value as a plain string
display(str(log_global_median))

'1.38629'

In [5]:
# Mean of log-transformed demand for several key combinations, each stored as a plain
# dict mapping the group key (a scalar or a tuple of ids) to its mean log-demand.
# The author found log-space means to work better than standard medians and means.
# Math explanation : https://www.kaggle.com/apapiu/grupo-bimbo-inventory-demand/mean-vs-medians-a-mathy-approach/discussion
logmeans_prod = df_train.groupby("product_id")["log_demand"].mean().to_dict()
logmeans_prod_store = (
    df_train.groupby(["product_id", "store_id"])["log_demand"].mean().to_dict()
)
logmeans_prod_depot = (
    df_train.groupby(["product_id", "depot_id"])["log_demand"].mean().to_dict()
)
logmeans_prod_chan = (
    df_train.groupby(["product_id", "channel_id"])["log_demand"].mean().to_dict()
)
logmeans_prod_depot_store = (
    df_train.groupby(["product_id", "depot_id", "store_id"])["log_demand"].mean().to_dict()
)
logmeans_prod_chan_store = (
    df_train.groupby(["product_id", "channel_id", "store_id"])["log_demand"].mean().to_dict()
)

In [6]:
def solution(key):
    """Return the log-space demand estimate for a single test row.

    Parameters
    ----------
    key : iterable
        Its first four items are read, in order, as
        (product_id, channel_id, store_id, depot_id).

    Returns
    -------
    The value from the first lookup table (tried in the fixed order below)
    that contains the row's key, or ``log_global_median`` when no table does.

    Raises
    ------
    IndexError
        If ``key`` yields fewer than four items.
    """
    fields = tuple(key)
    prod, chan, store, depot = fields[0], fields[1], fields[2], fields[3]

    # Most specific combination first
    if (prod, depot, store) in logmeans_prod_depot_store:
        return logmeans_prod_depot_store[prod, depot, store]

    if (prod, store) in logmeans_prod_store:
        return logmeans_prod_store[prod, store]

    # NOTE(review): when both tables are group means built from the same training
    # frame, every (product, channel, store) key also has a (product, store) entry,
    # so this branch looks unreachable - confirm whether it was meant to be checked
    # before the (product, store) lookup.
    if (prod, chan, store) in logmeans_prod_chan_store:
        return logmeans_prod_chan_store[prod, chan, store]

    if (prod, depot) in logmeans_prod_depot:
        return logmeans_prod_depot[prod, depot]

    if (prod, chan) in logmeans_prod_chan:
        return logmeans_prod_chan[prod, chan]

    if prod in logmeans_prod:
        return logmeans_prod[prod]

    # Nothing known about this product at all
    return log_global_median

# Look up a log-space estimate for every test row; the columns are selected in the
# order solution() reads them (product, channel, store, depot)
df_test["Demanda_uni_equil"] = df_test[
    ["product_id", "channel_id", "store_id", "depot_id"]
].apply(solution, axis=1)

# Invert the log1p transform with expm1, scale by exp_constant, keep three decimals
df_test["Demanda_uni_equil"] = (np.expm1(df_test["Demanda_uni_equil"]) * exp_constant).round(3)

In [7]:
# Submission frame: one prediction per row id, with id as the index so it becomes
# the first CSV column
df_submit = df_test[["id", "Demanda_uni_equil"]].set_index("id")
df_submit.to_csv("submissions/log_means.csv")