In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
infos = pd.read_csv('../../data/infos.csv', sep = '|')

In [3]:
items = pd.read_csv('../../data/items.csv', sep = '|')

In [4]:
orders = pd.read_csv('../../data/orders.csv', sep = '|', parse_dates=['time'])

## - Creating the structure

In [5]:
df = orders.copy()

In [6]:
df['weekpair'] = (df.time.dt.dayofyear + 1) // 14 - 13

In [7]:
npairs = df.weekpair.nunique()

In [8]:
n_items = items['itemID'].nunique()
print('total number of items:', n_items)
print('expected number of instances:', n_items * npairs)

total number of items: 10463
expected number of instances: 136019


In [9]:
mi = pd.MultiIndex.from_product([range(-npairs, 0 + 1), items['itemID']], names=['weekpair', 'itemID'])
data_temp = pd.DataFrame(index = mi)

In [10]:
data_temp = data_temp.join(df.groupby(['weekpair', 'itemID'])[['order']].sum(), how = 'left')

In [11]:
data_temp.fillna(0, inplace = True)

In [12]:
data_temp.groupby('itemID').count().min()

order    14
dtype: int64

## - Creating features

In [13]:
def gbagg(data, group_cols, targeted_cols, out_names, function, as_index = False):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = []
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval)
        new_feat.append(output)

        # go to next group
        i = j
    
    idx = data.groupby(group_cols).size().index # this is actually fast...
    out_df = pd.DataFrame(new_feat, columns = out_names, index = idx)
        
    if not as_index:
        out_df.reset_index(inplace = True)
        
    return out_df

In [14]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [15]:
def shift_and_2n_window(x, ws):
#     out = pd.DataFrame(x)
#     out = out.shift()
#     out = out.rolling(2 ** n).mean()

    shifted = np.zeros_like(x) # output
    shifted[1:] = x[:-1] # shift
    out = np.zeros_like(x, dtype = float)
    
    # rolling mean
    total = shifted[:ws].sum()
    out[ws - 1] = total / ws
    for i in range(ws, len(out)):
        total = total - shifted[i - ws] + shifted[i]
        out[i] = total / ws
    out[:ws] = np.NaN # maybe ws -1 should be NaN as well for receiving one NaN value when ws > 1
    # out[0] = np.NaN # this is always NaN for a shift of 1
    return out

In [16]:
data = data_temp.reset_index()
data = pd.merge(data, items, on = 'itemID')

In [22]:
data.loc[data["weekpair"] == 0, "order"] = 1 # facilita para fazer feature que detecta item novo

In [23]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [24]:
# gbtransf(data, ['itemID', 'weekpair'], ['order'], ['out'], lambda x : np.ones_like(x))

In [25]:
shift_and_2n_window(np.array([1 , 2, 3, 4, 5, 6]), 2 ** 1)

array([nan, nan, 1.5, 2.5, 3.5, 4.5])

In [26]:
features = [('itemID', 'item')]

In [27]:
for f, name in features:
    print(f)
    new_feature_block = pd.DataFrame()
    for n in range(3):
        new_f = gbtransf(data, ['itemID'], ['order'], ['out'], shift_and_2n_window, {'ws' : 2 ** n})
        new_feature_block['%s_%d' % (name, 2 ** n)] = new_f['out']
#     data = pd.merge(data, new_feature_block.reset_index(), on = [f, 'weekpair'])
    data = pd.concat([data, new_feature_block], axis =  1)

itemID


In [28]:
data["diff"] = data.groupby("order")["order"].shift()
data["diff"] = data.groupby("order")["diff"].diff()
data["diff2"] = data.groupby("order")["diff"].diff()

In [29]:
data.count() # the larger the window, more NaN are expected

weekpair                  146482
itemID                    146482
order                     146482
brand                     146482
manufacturer              146482
customerRating            146482
category1                 146482
category2                 146482
category3                 146482
recommendedRetailPrice    146482
item_1                    136019
item_2                    125556
item_4                    104630
diff                      144694
diff2                     144113
dtype: int64

In [30]:
def dist2firstvalueLeak(x):
    out = np.zeros_like(x, dtype = float)
    for i in range(len(x)):
        if x[i] != 0:
            out[i] = 1
            break
#         else:
#             out[i] = -9999
        
    return out

In [31]:
dist2firstvalueLeak(np.array([0 , 0, 3, 0, 5, 6]))

array([0., 0., 1., 0., 0., 0.])

In [32]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [33]:
data['dist2firstvalueLeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2firstvalueLeak)['out']
# data['dist2lastpeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2lastpeak)['out']

In [34]:
the_cat = "manufacturer"

In [35]:
sla = data.groupby(["weekpair", the_cat])["dist2firstvalueLeak"].sum().reset_index()

In [36]:
sla = sla.rename(columns={"dist2firstvalueLeak" : "leak_cat3"})

In [37]:
# acho que mudou nada... opa, mudou sim
data = pd.merge(data, sla, on = ["weekpair", the_cat])

In [38]:
data["total_new"] = data["weekpair"].map(data.groupby("weekpair")["dist2firstvalueLeak"].sum().to_dict())

In [39]:
total_trans = df.groupby(["weekpair", "itemID"]).size().reset_index().rename(columns = {0 : "total_trans"})

In [40]:
new_week = infos[["itemID"]].copy()
new_week["weekpair"] = 0
new_week["total_trans"] = 0
total_trans = pd.concat([total_trans, new_week], ignore_index=True)

In [41]:
total_trans.sort_values("weekpair", inplace = True)

In [42]:
total_trans["total_trans"] = total_trans.groupby("itemID")["total_trans"].shift()

In [43]:
data = pd.merge(data, total_trans, on = ["weekpair", "itemID"], how = "left")

In [44]:
data.fillna(0, inplace=True)

## - Min Expected Error

In [45]:
def evaluate(prediction, target, simulationPrice):
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [46]:
def evaluate_prop(prediction, target, simulationPrice):
    max_profit = np.sum((target)  * simulationPrice)
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice) / max_profit

In [47]:
# max expected rmse
from sklearn.metrics import mean_squared_error as mse
# pred = data.loc[1:12].groupby('itemID')['order'].mean().sort_index()
# target_week = data.loc[13:, 'order'].reset_index(level = 0, drop = True).sort_index()
# mse(target_week, pred) ** .5

<hr>

##  - Split Data

In [48]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [49]:
data.columns

Index(['weekpair', 'itemID', 'order', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'item_1', 'item_2', 'item_4', 'diff', 'diff2',
       'dist2firstvalueLeak', 'leak_cat3', 'total_new', 'total_trans'],
      dtype='object')

In [50]:
remove = set([
#     "weekpair",
    "brand",
#     "manufacturer",
#     "customerRating",
#     "category1",
#     "category2",
    "category3",
#     "recommendedRetailPrice", # very important!
#     "total_new",
#     "total_trans", # the master!
])
remove

{'brand', 'category3'}

In [51]:
sel_cols = set(list(data.columns)) - remove

In [52]:
# filtered_data = data
filtered_data = data.query("dist2firstvalueLeak != 1")[sel_cols]

In [53]:
len(data), len(filtered_data)

(146482, 136019)

In [54]:
sub_week = 0
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [55]:
sub["weekpair"] += 1

In [56]:
len(train), len(val), len(sub)

(116444, 9735, 9840)

In [57]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

i_train = train.pop("itemID")
i_full_train = full_train.pop("itemID")
i_val = val.pop("itemID")
i_sub = sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

<hr>

## - XGBoost

In [58]:
import xgboost as xgb

In [59]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.astype(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.prediction(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [60]:
# custom feval

def feval(prediction, dtrain):
    prediction = prediction.astype(int)
#     predt = np.minimum(predt.astype(int), 1)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    max_profit = np.sum((target) * simulationPrice)
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice) / max_profit

In [61]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':7,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 10000,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 800
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:0.00125	val-feval:0.00136
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:0.00489	val-feval:0.00476
[2]	train-feval:0.00864	val-feval:0.01058
[3]	train-feval:0.01211	val-feval:0.01453
[4]	train-feval:0.01556	val-feval:0.01806
[5]	train-feval:0.01931	val-feval:0.02136
[6]	train-feval:0.02356	val-feval:0.02624
[7]	train-feval:0.02572	val-feval:0.02932
[8]	train-feval:0.02938	val-feval:0.03350
[9]	train-feval:0.03112	val-feval:0.03563
[10]	train-feval:0.03274	val-feval:0.03755
[11]	train-feval:0.03674	val-feval:0.03984
[12]	train-feval:0.04087	val-feval:0.04574
[13]	train-feval:0.04403	val-feval:0.04901
[14]	train-feval:0.04696	val-feval:0.05170
[15]	train-feval:0.04809	val-feval:0.05366
[16]	train-feval:0.05045	val-feval:0.05498
[17]	train-feval:0.05139	val-feval:0.05583
[18]	train-feval:0.05929	val-feval:0.06457
[19]	train-feval:0.06057	val-feval:0.06549
[20]	tr

In [62]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub), evaluate_prop(prediction, y_sub, w_sub)

(-2458791.9580000006, -2.0231528483317303)

In [63]:
prediction.sum(), len(np.unique(prediction))

(211119, 104)

In [64]:
# retrain!

In [65]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

121

In [66]:
prediction_old = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction_old, y_sub, w_sub), evaluate_prop(prediction_old, y_sub, w_sub)

(-2463317.2420000006, -2.026876360271942)

In [67]:
old_preds = {item : pred for item, pred in zip(i_sub, prediction_old)}

In [68]:
# some other things below

In [69]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

1215326.8399999999

In [70]:
# using previous weekpair
# evaluate(y_val, y_sub, w_sub)

<hr>

In [90]:
# filtered_data = data
filtered_data = data.query("dist2firstvalueLeak == 1")[sel_cols]

In [91]:
len(data), len(filtered_data)

(146482, 10463)

In [92]:
#sub_week = 0
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [93]:
len(train), len(val), len(sub)

(9112, 728, 623)

In [94]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

i_train = train.pop("itemID")
i_full_train = full_train.pop("itemID")
i_val = val.pop("itemID")
i_sub = sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

## - XGBoost

In [95]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':4,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 10000,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 800
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:0.00256	val-feval:0.00347
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:0.00597	val-feval:0.00796
[2]	train-feval:0.01174	val-feval:0.01334
[3]	train-feval:0.01270	val-feval:0.01436
[4]	train-feval:0.01500	val-feval:0.01737
[5]	train-feval:0.01903	val-feval:0.02036
[6]	train-feval:0.02683	val-feval:0.02829
[7]	train-feval:0.03147	val-feval:0.03318
[8]	train-feval:0.03278	val-feval:0.03478
[9]	train-feval:0.03459	val-feval:0.03751
[10]	train-feval:0.03633	val-feval:0.03973
[11]	train-feval:0.03746	val-feval:0.04067
[12]	train-feval:0.04938	val-feval:0.04918
[13]	train-feval:0.05291	val-feval:0.05427
[14]	train-feval:0.05441	val-feval:0.05640
[15]	train-feval:0.05579	val-feval:0.05813
[16]	train-feval:0.05671	val-feval:0.05894
[17]	train-feval:0.05956	val-feval:0.06277
[18]	train-feval:0.06218	val-feval:0.06520
[19]	train-feval:0.06407	val-feval:0.06796
[20]	tr

[186]	train-feval:0.29757	val-feval:0.34125
[187]	train-feval:0.29820	val-feval:0.34223
[188]	train-feval:0.29865	val-feval:0.34281
[189]	train-feval:0.29903	val-feval:0.34569
[190]	train-feval:0.30006	val-feval:0.34794
[191]	train-feval:0.30035	val-feval:0.34828
[192]	train-feval:0.30072	val-feval:0.34877
[193]	train-feval:0.30236	val-feval:0.34909
[194]	train-feval:0.30303	val-feval:0.34944
[195]	train-feval:0.30369	val-feval:0.35029
[196]	train-feval:0.30388	val-feval:0.35051
[197]	train-feval:0.30454	val-feval:0.35091
[198]	train-feval:0.30505	val-feval:0.35158
[199]	train-feval:0.30531	val-feval:0.35195
[200]	train-feval:0.30572	val-feval:0.35251
[201]	train-feval:0.30610	val-feval:0.35280
[202]	train-feval:0.30679	val-feval:0.35430
[203]	train-feval:0.30712	val-feval:0.35474
[204]	train-feval:0.30723	val-feval:0.35495
[205]	train-feval:0.30777	val-feval:0.35546
[206]	train-feval:0.30854	val-feval:0.35692
[207]	train-feval:0.30925	val-feval:0.35759
[208]	train-feval:0.30946	val-fe

[373]	train-feval:0.35262	val-feval:0.40720
[374]	train-feval:0.35274	val-feval:0.40722
[375]	train-feval:0.35306	val-feval:0.40779
[376]	train-feval:0.35326	val-feval:0.40785
[377]	train-feval:0.35337	val-feval:0.40801
[378]	train-feval:0.35344	val-feval:0.40804
[379]	train-feval:0.35355	val-feval:0.40829
[380]	train-feval:0.35354	val-feval:0.40840
[381]	train-feval:0.35374	val-feval:0.40848
[382]	train-feval:0.35398	val-feval:0.40853
[383]	train-feval:0.35404	val-feval:0.40853
[384]	train-feval:0.35407	val-feval:0.40854
[385]	train-feval:0.35417	val-feval:0.40857
[386]	train-feval:0.35435	val-feval:0.40880
[387]	train-feval:0.35440	val-feval:0.40884
[388]	train-feval:0.35457	val-feval:0.40897
[389]	train-feval:0.35465	val-feval:0.40852
[390]	train-feval:0.35478	val-feval:0.40856
[391]	train-feval:0.35481	val-feval:0.40849
[392]	train-feval:0.35491	val-feval:0.40851
[393]	train-feval:0.35494	val-feval:0.40866
[394]	train-feval:0.35496	val-feval:0.40872
[395]	train-feval:0.35509	val-fe

In [96]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub), evaluate_prop(prediction, y_sub, w_sub)

(-735639.9660000001, -12.00608898907564)

In [97]:
prediction.sum()

43902

In [98]:
# retrain!

In [99]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

428

In [100]:
prediction_new = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction_new, y_sub, w_sub), evaluate_prop(prediction_new, y_sub, w_sub)

(-749497.0980000002, -12.23224576088617)

In [101]:
new_preds = {item : pred for item, pred in zip(i_sub, prediction_new)}

In [102]:
# some other things below

In [103]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

61272.24000000001

In [104]:
# using previous weekpair
# evaluate(y_val, y_sub, w_sub)

In [105]:
preds = {**old_preds, **new_preds}

In [111]:
len(preds), len(infos), len(old_preds), len(new_preds)

(10463, 10463, 9840, 623)

In [113]:
preds

{1: 17,
 31: 10,
 32: 34,
 37: 10,
 108: 29,
 109: 15,
 112: 7,
 281: 29,
 282: 29,
 283: 23,
 387: 16,
 388: 15,
 430: 6,
 431: 7,
 688: 5,
 720: 9,
 721: 29,
 907: 10,
 908: 5,
 1039: 22,
 1097: 9,
 1300: 16,
 1301: 23,
 1302: 29,
 1303: 7,
 1304: 8,
 1305: 9,
 1306: 7,
 1307: 11,
 1308: 8,
 1309: 12,
 1310: 9,
 1451: 10,
 1452: 9,
 1453: 11,
 1454: 9,
 1604: 5,
 1641: 6,
 1642: 17,
 1643: 7,
 2: 31,
 4: 15,
 5: 20,
 6: 16,
 11: 20,
 12: 19,
 14: 32,
 28: 17,
 48: 27,
 64: 36,
 65: 25,
 68: 30,
 69: 41,
 72: 25,
 73: 38,
 74: 14,
 90: 10,
 148: 13,
 151: 21,
 163: 25,
 164: 13,
 236: 41,
 237: 24,
 238: 38,
 239: 31,
 253: 9,
 259: 10,
 260: 9,
 261: 9,
 262: 36,
 263: 10,
 269: 38,
 270: 9,
 271: 12,
 272: 15,
 284: 15,
 285: 7,
 286: 26,
 288: 25,
 289: 38,
 290: 35,
 291: 15,
 292: 9,
 293: 7,
 342: 23,
 343: 13,
 344: 10,
 345: 10,
 347: 13,
 348: 17,
 349: 9,
 350: 23,
 351: 8,
 366: 27,
 371: 14,
 372: 15,
 373: 32,
 374: 9,
 382: 41,
 408: 13,
 409: 13,
 410: 10,
 432: 26,
 43

In [110]:
submission = items[['itemID']].copy()
submission['demandPrediction'] = items["itemID"].map(preds)
submission.to_csv('../../submissions/final_submission.csv', sep = '|', index=False)
submission.head()

Unnamed: 0,itemID,demandPrediction
0,1,17
1,2,31
2,3,7
3,4,15
4,5,20


In [114]:
(submission["demandPrediction"] == 0).sum()

0