In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
infos = pd.read_csv('../../data/infos.csv', sep = '|')

In [3]:
items = pd.read_csv('../../data/items.csv', sep = '|')

In [4]:
orders = pd.read_csv('../../data/orders.csv', sep = '|', parse_dates=['time'])

## - Creating the structure

In [5]:
df = orders.copy()

In [6]:
df['weekpair'] = (df.time.dt.dayofyear + 1) // 14 - 13

In [7]:
npairs = df.weekpair.nunique()

In [8]:
n_items = items['itemID'].nunique()
print('total number of items:', n_items)
print('expected number of instances:', n_items * npairs)

total number of items: 10463
expected number of instances: 136019


In [9]:
mi = pd.MultiIndex.from_product([range(-npairs, 0 + 1), items['itemID']], names=['weekpair', 'itemID'])
data_temp = pd.DataFrame(index = mi)

In [10]:
data_temp = data_temp.join(df.groupby(['weekpair', 'itemID'])[['order']].sum(), how = 'left')

In [11]:
data_temp.fillna(0, inplace = True)

In [12]:
data_temp.groupby('itemID').count().min()

order    14
dtype: int64

## - Creating features

In [13]:
def gbagg(data, group_cols, targeted_cols, out_names, function, as_index = False):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = []
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval)
        new_feat.append(output)

        # go to next group
        i = j
    
    idx = data.groupby(group_cols).size().index # this is actually fast...
    out_df = pd.DataFrame(new_feat, columns = out_names, index = idx)
        
    if not as_index:
        out_df.reset_index(inplace = True)
        
    return out_df

In [14]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [15]:
def shift_and_2n_window(x, ws):
#     out = pd.DataFrame(x)
#     out = out.shift()
#     out = out.rolling(2 ** n).mean()

    shifted = np.zeros_like(x) # output
    shifted[1:] = x[:-1] # shift
    out = np.zeros_like(x, dtype = float)
    
    # rolling mean
    total = shifted[:ws].sum()
    out[ws - 1] = total / ws
    for i in range(ws, len(out)):
        total = total - shifted[i - ws] + shifted[i]
        out[i] = total / ws
    out[:ws] = np.NaN # maybe ws -1 should be NaN as well for receiving one NaN value when ws > 1
    # out[0] = np.NaN # this is always NaN for a shift of 1
    return out

In [16]:
data = data_temp.reset_index()
data = pd.merge(data, items, on = 'itemID')

In [17]:
data.loc[data["weekpair"] == 0, "order"] = 1 # facilita para fazer feature que detecta item novo

In [18]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [19]:
# gbtransf(data, ['itemID', 'weekpair'], ['order'], ['out'], lambda x : np.ones_like(x))

In [20]:
shift_and_2n_window(np.array([1 , 2, 3, 4, 5, 6]), 2 ** 1)

array([nan, nan, 1.5, 2.5, 3.5, 4.5])

In [21]:
features = [('itemID', 'item')]

In [22]:
for f, name in features:
    print(f)
    new_feature_block = pd.DataFrame()
    for n in range(3):
        new_f = gbtransf(data, ['itemID'], ['order'], ['out'], shift_and_2n_window, {'ws' : 2 ** n})
        new_feature_block['%s_%d' % (name, 2 ** n)] = new_f['out']
#     data = pd.merge(data, new_feature_block.reset_index(), on = [f, 'weekpair'])
    data = pd.concat([data, new_feature_block], axis =  1)

itemID


In [23]:
data["diff"] = data.groupby("order")["order"].shift()
data["diff"] = data.groupby("order")["diff"].diff()
data["diff2"] = data.groupby("order")["diff"].diff()

In [24]:
data.count() # the larger the window, more NaN are expected

weekpair                  146482
itemID                    146482
order                     146482
brand                     146482
manufacturer              146482
customerRating            146482
category1                 146482
category2                 146482
category3                 146482
recommendedRetailPrice    146482
item_1                    136019
item_2                    125556
item_4                    104630
diff                      144694
diff2                     144113
dtype: int64

In [25]:
def dist2firstvalueLeak(x):
    out = np.zeros_like(x, dtype = float)
    for i in range(len(x)):
        if x[i] != 0:
            out[i] = 1
            break
#         else:
#             out[i] = -9999
        
    return out

In [26]:
dist2firstvalueLeak(np.array([0 , 0, 3, 0, 5, 6]))

array([0., 0., 1., 0., 0., 0.])

In [27]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [28]:
data['dist2firstvalueLeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2firstvalueLeak)['out']
# data['dist2lastpeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2lastpeak)['out']

In [29]:
the_cat = "manufacturer"

In [30]:
sla = data.groupby(["weekpair", the_cat])["dist2firstvalueLeak"].sum().reset_index()

In [31]:
sla = sla.rename(columns={"dist2firstvalueLeak" : "leak_cat3"})

In [32]:
# acho que mudou nada... opa, mudou sim
data = pd.merge(data, sla, on = ["weekpair", the_cat])

In [33]:
data["total_new"] = data["weekpair"].map(data.groupby("weekpair")["dist2firstvalueLeak"].sum().to_dict())

In [34]:
total_trans = df.groupby(["weekpair", "itemID"]).size().reset_index().rename(columns = {0 : "total_trans"})

In [35]:
new_week = infos[["itemID"]].copy()
new_week["weekpair"] = 0
new_week["total_trans"] = 0
total_trans = pd.concat([total_trans, new_week], ignore_index=True)

In [36]:
total_trans.sort_values("weekpair", inplace = True)

In [37]:
total_trans["total_trans"] = total_trans.groupby("itemID")["total_trans"].shift()

In [38]:
data = pd.merge(data, total_trans, on = ["weekpair", "itemID"], how = "left")

In [39]:
data.fillna(0, inplace=True)

## - Min Expected Error

In [40]:
def evaluate(prediction, target, simulationPrice):
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [41]:
def evaluate_prop(prediction, target, simulationPrice):
    max_profit = np.sum((target)  * simulationPrice)
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice) / max_profit

In [42]:
# max expected rmse
from sklearn.metrics import mean_squared_error as mse
# pred = data.loc[1:12].groupby('itemID')['order'].mean().sort_index()
# target_week = data.loc[13:, 'order'].reset_index(level = 0, drop = True).sort_index()
# mse(target_week, pred) ** .5

<hr>

##  - Split Data

In [43]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [44]:
data.columns

Index(['weekpair', 'itemID', 'order', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'item_1', 'item_2', 'item_4', 'diff', 'diff2',
       'dist2firstvalueLeak', 'leak_cat3', 'total_new', 'total_trans'],
      dtype='object')

In [45]:
remove = set([
#     "weekpair",
    "brand",
#     "manufacturer",
#     "customerRating",
#     "category1",
#     "category2",
    "category3",
#     "recommendedRetailPrice", # very important!
#     "total_new",
#     "total_trans", # the master!
])
remove

{'brand', 'category3'}

In [46]:
sel_cols = set(list(data.columns)) - remove

In [97]:
# filtered_data = data
filtered_data = data.query("dist2firstvalueLeak != 1")[sel_cols]

In [98]:
len(data), len(filtered_data)

(146482, 136019)

In [99]:
sub_week = -1
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [100]:
sub["weekpair"] += 1

In [101]:
len(train), len(val), len(sub)

(106708, 9736, 9735)

In [102]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

i_train = train.pop("itemID")
i_full_train = full_train.pop("itemID")
i_val = val.pop("itemID")
i_sub = sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

<hr>

## - XGBoost

In [103]:
import xgboost as xgb

In [104]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.astype(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.prediction(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [105]:
# custom feval

def feval(prediction, dtrain):
    prediction = prediction.astype(int)
#     predt = np.minimum(predt.astype(int), 1)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    max_profit = np.sum((target) * simulationPrice)
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice) / max_profit

In [106]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':7,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 10000,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 800
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:0.00179	val-feval:0.00150
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:0.00546	val-feval:0.00578
[2]	train-feval:0.00784	val-feval:0.00785
[3]	train-feval:0.01136	val-feval:0.01270
[4]	train-feval:0.01490	val-feval:0.01660
[5]	train-feval:0.01742	val-feval:0.01953
[6]	train-feval:0.02109	val-feval:0.02222
[7]	train-feval:0.02524	val-feval:0.02617
[8]	train-feval:0.02977	val-feval:0.03182
[9]	train-feval:0.03363	val-feval:0.03595
[10]	train-feval:0.03954	val-feval:0.04280
[11]	train-feval:0.04141	val-feval:0.04562
[12]	train-feval:0.04259	val-feval:0.04688
[13]	train-feval:0.04471	val-feval:0.04916
[14]	train-feval:0.04663	val-feval:0.05039
[15]	train-feval:0.04802	val-feval:0.05195
[16]	train-feval:0.05006	val-feval:0.05378
[17]	train-feval:0.05101	val-feval:0.05477
[18]	train-feval:0.05188	val-feval:0.05544
[19]	train-feval:0.06033	val-feval:0.05998
[20]	tr

In [107]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub), evaluate_prop(prediction, y_sub, w_sub)

(589962.3579999998, 0.12020379543372957)

In [108]:
prediction.sum(), len(np.unique(prediction))

(107874, 121)

In [109]:
# retrain!

In [110]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

157

In [111]:
prediction_old = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction_old, y_sub, w_sub), evaluate_prop(prediction_old, y_sub, w_sub)

(545898.2879999998, 0.11122581847565127)

In [112]:
old_preds = {item : pred for item, pred in zip(i_sub, prediction_old)}

In [66]:
# some other things below

In [67]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

4908017.72

In [68]:
# using previous weekpair
# evaluate(y_val, y_sub, w_sub)

<hr>

In [85]:
# filtered_data = data
filtered_data = data.query("dist2firstvalueLeak == 1")[sel_cols]

In [86]:
len(data), len(filtered_data)

(146482, 10463)

In [87]:
#sub_week = 0
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [88]:
len(train), len(val), len(sub)

(8385, 727, 728)

In [89]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

i_train = train.pop("itemID")
i_full_train = full_train.pop("itemID")
i_val = val.pop("itemID")
i_sub = sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

## - XGBoost

In [90]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':4,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 10000,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 800
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:0.00275	val-feval:0.00359
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:0.00504	val-feval:0.00617
[2]	train-feval:0.01171	val-feval:0.01334
[3]	train-feval:0.01331	val-feval:0.01557
[4]	train-feval:0.01549	val-feval:0.01796
[5]	train-feval:0.01895	val-feval:0.02028
[6]	train-feval:0.02065	val-feval:0.02250
[7]	train-feval:0.03120	val-feval:0.03087
[8]	train-feval:0.03248	val-feval:0.03293
[9]	train-feval:0.03526	val-feval:0.03658
[10]	train-feval:0.03624	val-feval:0.03744
[11]	train-feval:0.03775	val-feval:0.03968
[12]	train-feval:0.05137	val-feval:0.04810
[13]	train-feval:0.05385	val-feval:0.05111
[14]	train-feval:0.05455	val-feval:0.05187
[15]	train-feval:0.05607	val-feval:0.05420
[16]	train-feval:0.05713	val-feval:0.05572
[17]	train-feval:0.05989	val-feval:0.05775
[18]	train-feval:0.06301	val-feval:0.06219
[19]	train-feval:0.06421	val-feval:0.06406
[20]	tr

[186]	train-feval:0.29620	val-feval:0.31116
[187]	train-feval:0.29679	val-feval:0.31248
[188]	train-feval:0.29717	val-feval:0.31211
[189]	train-feval:0.29756	val-feval:0.31269
[190]	train-feval:0.29821	val-feval:0.31375
[191]	train-feval:0.29886	val-feval:0.31520
[192]	train-feval:0.29925	val-feval:0.31545
[193]	train-feval:0.29963	val-feval:0.31596
[194]	train-feval:0.30036	val-feval:0.31728
[195]	train-feval:0.30074	val-feval:0.31822
[196]	train-feval:0.30091	val-feval:0.31851
[197]	train-feval:0.30139	val-feval:0.31921
[198]	train-feval:0.30191	val-feval:0.31989
[199]	train-feval:0.30236	val-feval:0.32037
[200]	train-feval:0.30264	val-feval:0.32075
[201]	train-feval:0.30319	val-feval:0.32121
[202]	train-feval:0.30352	val-feval:0.32135
[203]	train-feval:0.30392	val-feval:0.32195
[204]	train-feval:0.30407	val-feval:0.32284
[205]	train-feval:0.30444	val-feval:0.32332
[206]	train-feval:0.30512	val-feval:0.32402
[207]	train-feval:0.30532	val-feval:0.32444
[208]	train-feval:0.30640	val-fe

[373]	train-feval:0.34998	val-feval:0.38026
[374]	train-feval:0.35016	val-feval:0.38090
[375]	train-feval:0.35017	val-feval:0.38105
[376]	train-feval:0.35022	val-feval:0.38147
[377]	train-feval:0.35036	val-feval:0.38169
[378]	train-feval:0.35047	val-feval:0.38179
[379]	train-feval:0.35050	val-feval:0.38178
[380]	train-feval:0.35057	val-feval:0.38190
[381]	train-feval:0.35067	val-feval:0.38224
[382]	train-feval:0.35070	val-feval:0.38246
[383]	train-feval:0.35090	val-feval:0.38243
[384]	train-feval:0.35096	val-feval:0.38250
[385]	train-feval:0.35112	val-feval:0.38296
[386]	train-feval:0.35110	val-feval:0.38287
[387]	train-feval:0.35123	val-feval:0.38287
[388]	train-feval:0.35131	val-feval:0.38294
[389]	train-feval:0.35130	val-feval:0.38295
[390]	train-feval:0.35139	val-feval:0.38308
[391]	train-feval:0.35145	val-feval:0.38320
[392]	train-feval:0.35148	val-feval:0.38328
[393]	train-feval:0.35145	val-feval:0.38388
[394]	train-feval:0.35160	val-feval:0.38395
[395]	train-feval:0.35158	val-fe

[560]	train-feval:0.36154	val-feval:0.39658
[561]	train-feval:0.36154	val-feval:0.39658
[562]	train-feval:0.36164	val-feval:0.39672
[563]	train-feval:0.36155	val-feval:0.39688
[564]	train-feval:0.36163	val-feval:0.39696
[565]	train-feval:0.36166	val-feval:0.39745
[566]	train-feval:0.36163	val-feval:0.39748
[567]	train-feval:0.36156	val-feval:0.39744
[568]	train-feval:0.36158	val-feval:0.39743
[569]	train-feval:0.36160	val-feval:0.39748
[570]	train-feval:0.36162	val-feval:0.39748
[571]	train-feval:0.36166	val-feval:0.39753
[572]	train-feval:0.36152	val-feval:0.39756
[573]	train-feval:0.36153	val-feval:0.39760
[574]	train-feval:0.36162	val-feval:0.39760
[575]	train-feval:0.36164	val-feval:0.39756
[576]	train-feval:0.36146	val-feval:0.39763
[577]	train-feval:0.36150	val-feval:0.39756
[578]	train-feval:0.36156	val-feval:0.39765
[579]	train-feval:0.36177	val-feval:0.39759
[580]	train-feval:0.36189	val-feval:0.39763
[581]	train-feval:0.36173	val-feval:0.39763
[582]	train-feval:0.36176	val-fe

In [91]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub), evaluate_prop(prediction, y_sub, w_sub)

(1184780.54, 0.42690555638958694)

In [92]:
prediction.sum() 

107806

In [93]:
# retrain!

In [94]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

602

In [95]:
prediction_new = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction_new, y_sub, w_sub), evaluate_prop(prediction_new, y_sub, w_sub)

(1172361.756, 0.4224307631986032)

In [96]:
new_preds = {item : pred for item, pred in zip(i_sub, prediction_new)}

In [102]:
# some other things below

In [103]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

61272.24000000001

In [104]:
# using previous weekpair
# evaluate(y_val, y_sub, w_sub)

In [115]:
preds = {**old_preds, **new_preds}

In [116]:
len(preds), len(infos), len(old_preds), len(new_preds)

(10463, 10463, 9735, 728)

In [110]:
# submission = items[['itemID']].copy()
# submission['demandPrediction'] = items["itemID"].map(preds)
# submission.to_csv('../../submissions/final_submission.csv', sep = '|', index=False)
# submission.head()

Unnamed: 0,itemID,demandPrediction
0,1,17
1,2,31
2,3,7
3,4,15
4,5,20


In [114]:
(submission["demandPrediction"] == 0).sum()

0