In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
infos = pd.read_csv('../../data/infos.csv', sep = '|')

In [3]:
items = pd.read_csv('../../data/items.csv', sep = '|')

In [4]:
orders = pd.read_csv('../../data/orders.csv', sep = '|', parse_dates=['time'])

## - Creating the structure

In [5]:
df = orders.copy()

In [6]:
df['weekpair'] = (df.time.dt.dayofyear + 1) // 14 - 13

In [7]:
npairs = df.weekpair.nunique()

In [8]:
n_items = items['itemID'].nunique()
print('total number of items:', n_items)
print('expected number of instances:', n_items * npairs)

total number of items: 10463
expected number of instances: 136019


In [9]:
mi = pd.MultiIndex.from_product([range(-npairs, 0), items['itemID']], names=['weekpair', 'itemID'])
data_temp = pd.DataFrame(index = mi)

In [10]:
data_temp = data_temp.join(df.groupby(['weekpair', 'itemID'])[['order']].sum(), how = 'left')

In [11]:
data_temp.fillna(0, inplace = True)

In [12]:
data_temp.groupby('itemID').count().min()

order    13
dtype: int64

## - Creating features

In [13]:
def gbagg(data, group_cols, targeted_cols, out_names, function, as_index = False):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = []
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval)
        new_feat.append(output)

        # go to next group
        i = j
    
    idx = data.groupby(group_cols).size().index # this is actually fast...
    out_df = pd.DataFrame(new_feat, columns = out_names, index = idx)
        
    if not as_index:
        out_df.reset_index(inplace = True)
        
    return out_df

In [14]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [15]:
def shift_and_2n_window(x, ws):
#     out = pd.DataFrame(x)
#     out = out.shift()
#     out = out.rolling(2 ** n).mean()

    shifted = np.zeros_like(x) # output
    shifted[1:] = x[:-1] # shift
    out = np.zeros_like(x, dtype = float)
    
    # rolling mean
    total = shifted[:ws].sum()
    out[ws - 1] = total / ws
    for i in range(ws, len(out)):
        total = total - shifted[i - ws] + shifted[i]
        out[i] = total / ws
    out[:ws] = np.NaN # maybe ws -1 should be NaN as well for receiving one NaN value when ws > 1
    # out[0] = np.NaN # this is always NaN for a shift of 1
    return out

In [16]:
data = data_temp.reset_index()
data = pd.merge(data, items, on = 'itemID')

In [17]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [18]:
# gbtransf(data, ['itemID', 'weekpair'], ['order'], ['out'], lambda x : np.ones_like(x))

In [19]:
shift_and_2n_window(np.array([1 , 2, 3, 4, 5, 6]), 2 ** 1)

array([nan, nan, 1.5, 2.5, 3.5, 4.5])

In [20]:
features = [('itemID', 'item')]

In [21]:
for f, name in features:
    print(f)
    new_feature_block = pd.DataFrame()
    for n in range(3):
        new_f = gbtransf(data, ['itemID'], ['order'], ['out'], shift_and_2n_window, {'ws' : 2 ** n})
        new_feature_block['%s_%d' % (name, 2 ** n)] = new_f['out']
#     data = pd.merge(data, new_feature_block.reset_index(), on = [f, 'weekpair'])
    data = pd.concat([data, new_feature_block], axis =  1)

itemID


In [22]:
data.count() # the larger the window, more NaN are expected

weekpair                  136019
itemID                    136019
order                     136019
brand                     136019
manufacturer              136019
customerRating            136019
category1                 136019
category2                 136019
category3                 136019
recommendedRetailPrice    136019
item_1                    125556
item_2                    115093
item_4                     94167
dtype: int64

In [23]:
def dist2firstvalue(x):
    out = np.zeros_like(x, dtype = float)
    first = np.NaN
    for i in range(len(x)):
        out[i] = first
        if x[i] != 0:
            first = i
            break
    if i == len(x) - 1:
        return out
    for j in range(int(first), len(x)):
        out[j] = j - first
        
    return out

In [24]:
dist2firstvalue(np.array([0 , 0, 0, 0])), dist2firstvalue(np.array([0 , 0, 3, 0, 5, 6]))

(array([nan, nan, nan, nan]), array([nan, nan,  0.,  1.,  2.,  3.]))

In [25]:
def dist2firstvalueLeak(x):
    out = np.zeros_like(x, dtype = float)
    for i in range(len(x)):
        if x[i] != 0:
            out[i] = 1
            break
#         else:
#             out[i] = -9999
        
    return out

# def dist2firstvalueLeakAdding(x):
#     out = np.zeros_like(x, dtype = float)
#     d = 1
#     for i in range(len(x)):
#         if x[i] != 0:
#             out[i] = d
#             break
#     d += 1
#     for j in range(i + 1, len(x)):
#         out[j] = d
#         d += 1
#     return out

In [26]:
dist2firstvalueLeak(np.array([0 , 0, 3, 0, 5, 6]))
# dist2firstvalue(np.array([0 , 0, 0, 0]))

array([0., 0., 1., 0., 0., 0.])

In [27]:
# def dist2lastpeak(x):
#     out = np.zeros_like(x, dtype = float)
#     peak = np.NaN
#     peak_val = 0
#     for i in range(0, len(x)):
#         out[i] = i - peak
#         if x[i] > peak_val:
#             peak = i
#             peak_val = x[i]
        
#     return out

In [28]:
# dist2lastpeak(np.array([0 , 0, 3, 0, 5, 6]))

In [29]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [30]:
data['dist2firstvalueLeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2firstvalueLeak)['out']
# data['dist2lastpeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2lastpeak)['out']

In [31]:
the_cat = "manufacturer"

In [32]:
sla = data.groupby(["weekpair", the_cat])["dist2firstvalueLeak"].sum().reset_index()

In [33]:
sla = sla.rename(columns={"dist2firstvalueLeak" : "leak_cat3"})

In [34]:
# acho que mudou nada... opa, mudou sim
data = pd.merge(data, sla, on = ["weekpair", the_cat])

In [35]:
data["total_new"] = data["weekpair"].map(data.groupby("weekpair")["dist2firstvalueLeak"].sum().to_dict())

In [36]:
data.fillna(0, inplace=True)

In [37]:
# checking if we got what we wanted
data.query('itemID == 1')

Unnamed: 0,weekpair,itemID,order,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,item_1,item_2,item_4,dist2firstvalueLeak,leak_cat3,total_new
0,-13,1,0.0,0,1,4.38,1,1,1,8.84,0.0,0.0,0.0,0.0,5.0,1431.0
40,-12,1,2.0,0,1,4.38,1,1,1,8.84,0.0,0.0,0.0,1.0,4.0,729.0
80,-11,1,313.0,0,1,4.38,1,1,1,8.84,2.0,1.0,0.0,0.0,6.0,371.0
120,-10,1,35.0,0,1,4.38,1,1,1,8.84,313.0,157.5,0.0,0.0,3.0,533.0
160,-9,1,3.0,0,1,4.38,1,1,1,8.84,35.0,174.0,87.5,0.0,0.0,785.0
200,-8,1,1.0,0,1,4.38,1,1,1,8.84,3.0,19.0,88.25,0.0,1.0,909.0
240,-7,1,1.0,0,1,4.38,1,1,1,8.84,1.0,2.0,88.0,0.0,0.0,716.0
280,-6,1,2.0,0,1,4.38,1,1,1,8.84,1.0,1.0,10.0,0.0,1.0,661.0
320,-5,1,299.0,0,1,4.38,1,1,1,8.84,2.0,1.5,1.75,0.0,12.0,785.0
360,-4,1,3.0,0,1,4.38,1,1,1,8.84,299.0,150.5,75.75,0.0,1.0,671.0


##  - Split Data

In [38]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [39]:
# filtered_data = data
filtered_data = data.query("dist2firstvalueLeak == 1")

In [40]:
len(data), len(filtered_data)

(136019, 9840)

In [41]:
# filtered_data.pop("itemID");

In [42]:
sub_week = -1
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [43]:
len(train), len(val), len(sub)

(8385, 727, 728)

In [44]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

train.pop("itemID")
full_train.pop("itemID")
val.pop("itemID")
sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

<hr>

## - Min Expected Error

In [45]:
def evaluate(prediction, target, simulationPrice):
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [46]:
# max expected rmse
from sklearn.metrics import mean_squared_error as mse
# pred = data.loc[1:12].groupby('itemID')['order'].mean().sort_index()
# target_week = data.loc[13:, 'order'].reset_index(level = 0, drop = True).sort_index()
# mse(target_week, pred) ** .5

<hr>

## - XGBoost

In [47]:
import xgboost as xgb

In [48]:
xgb.__version__

'1.1.0'

In [49]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.astype(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.prediction(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [50]:
# custom feval

def feval(prediction, dtrain):
    prediction = prediction.astype(int)
#     predt = np.minimum(predt.astype(int), 1)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [164]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':4,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 10000,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 800
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:84890.44004	val-feval:9371.34002
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:155756.33802	val-feval:16117.52401
[2]	train-feval:362273.85631	val-feval:34828.89000
[3]	train-feval:415741.49434	val-feval:40651.69799
[4]	train-feval:479058.11432	val-feval:46892.81399
[5]	train-feval:571556.77219	val-feval:50726.16200
[6]	train-feval:627300.82426	val-feval:56721.67800
[7]	train-feval:964776.66430	val-feval:80595.54197
[8]	train-feval:1005776.90636	val-feval:85970.26997
[9]	train-feval:1082760.80840	val-feval:93506.47399
[10]	train-feval:1112972.94246	val-feval:95649.81798
[11]	train-feval:1159033.48852	val-feval:101601.98599
[12]	train-feval:1591329.98221	val-feval:125802.30618
[13]	train-feval:1651519.07433	val-feval:132167.72619
[14]	train-feval:1679098.45436	val-feval:134531.08219
[15]	train-feval:1726117.44239	val-feval:140603.93019
[16]	train-feval:1771514

[149]	train-feval:8333098.51718	val-feval:746609.61271
[150]	train-feval:8354335.44322	val-feval:748591.56470
[151]	train-feval:8372571.23118	val-feval:749923.85871
[152]	train-feval:8396954.83312	val-feval:753966.44668
[153]	train-feval:8410277.22115	val-feval:756334.41463
[154]	train-feval:8434520.50317	val-feval:760787.29663
[155]	train-feval:8449380.77514	val-feval:762947.16463
[156]	train-feval:8469969.23913	val-feval:763615.11664
[157]	train-feval:8492069.11516	val-feval:765623.29666
[158]	train-feval:8507451.68727	val-feval:767603.23866
[159]	train-feval:8523936.73130	val-feval:769564.42866
[160]	train-feval:8538633.18721	val-feval:769513.73467
[161]	train-feval:8597468.49338	val-feval:773805.36273
[162]	train-feval:8614017.39541	val-feval:775936.04472
[163]	train-feval:8629978.52949	val-feval:777211.77273
[164]	train-feval:8652126.35948	val-feval:779680.50872
[165]	train-feval:8786635.29334	val-feval:789437.75867
[166]	train-feval:8802657.70320	val-feval:790115.81266
[167]	trai

[297]	train-feval:10490291.02139	val-feval:973435.07889
[298]	train-feval:10493876.33328	val-feval:975046.04489
[299]	train-feval:10495478.03530	val-feval:975273.50289
[300]	train-feval:10498032.07333	val-feval:975554.44689
[301]	train-feval:10503626.76536	val-feval:975947.57289
[302]	train-feval:10512687.85934	val-feval:975440.07688
[303]	train-feval:10518069.38335	val-feval:975664.91286
[304]	train-feval:10525486.34729	val-feval:976178.82686
[305]	train-feval:10525330.44526	val-feval:976393.02286
[306]	train-feval:10532941.36528	val-feval:976527.02286
[307]	train-feval:10540166.50726	val-feval:976845.82086
[308]	train-feval:10577786.23322	val-feval:977575.62486
[309]	train-feval:10583260.19127	val-feval:978199.56486
[310]	train-feval:10587018.88526	val-feval:978389.72086
[311]	train-feval:10596868.95325	val-feval:978674.13287
[312]	train-feval:10604765.94140	val-feval:982392.62694
[313]	train-feval:10611235.05736	val-feval:982453.14095
[314]	train-feval:10614890.68941	val-feval:98308

[442]	train-feval:11010986.70361	val-feval:1033363.83088
[443]	train-feval:11012769.89761	val-feval:1033246.01088
[444]	train-feval:11017461.91368	val-feval:1033777.55086
[445]	train-feval:11020436.36769	val-feval:1034175.87487
[446]	train-feval:11021218.08570	val-feval:1034018.07687
[447]	train-feval:11028365.23352	val-feval:1033683.95087
[448]	train-feval:11029145.36142	val-feval:1034045.63288
[449]	train-feval:11030368.52166	val-feval:1034462.25690
[450]	train-feval:11036753.45146	val-feval:1034438.90291
[451]	train-feval:11038359.03745	val-feval:1034633.02690
[452]	train-feval:11042437.94946	val-feval:1034304.21691
[453]	train-feval:11044274.93949	val-feval:1034454.62491
[454]	train-feval:11050107.09152	val-feval:1034777.86092
[455]	train-feval:11049091.88151	val-feval:1035319.02692
[456]	train-feval:11054396.08352	val-feval:1035312.31092
[457]	train-feval:11053091.13955	val-feval:1035450.72093
[458]	train-feval:11053762.82946	val-feval:1035645.14492
[459]	train-feval:11053530.5734

In [165]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

1156373.3959999997

In [166]:
# retrain!

In [167]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

496

In [168]:
prediction = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

1164394.386

In [158]:
# some other things below

In [411]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

7683293.24

In [412]:
# using previous weekpair
evaluate(y_val, y_sub, w_sub)

-3425880.0980000007

In [413]:
submission = items[['itemID']].copy()
submission['demandPrediction'] = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
submission.to_csv('../../submissions/sub_inclass_03.csv', sep = '|', index=False)
# submission.head()

<hr>

## - LGBM

In [56]:
def feval_lgbm(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice), True

In [57]:
data.columns

Index(['weekpair', 'itemID', 'order', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'item_1', 'item_2', 'item_4',
       'dist2firstvalueLeak', 'leak_cat3', 'total_new'],
      dtype='object')

In [58]:
list(data.columns).index("dist2firstvalueLeak")

13

In [79]:
import lightgbm as lgb

params = {
          "objective" : 'regression_l1',
#           "metric" :"rmse",
          "learning_rate" : 0.05,
          'verbosity': 2,
#           'max_depth': 6,
#           'num_leaves': 4,
          "min_data_in_leaf":1500
         }
# https://lightgbm.readthedocs.io/en/latest/Parameters.html


ds_params = {
#     'categorical_feature' : [3, 4, 5, 7, list(data.columns).index("dist2firstvalueLeak"),],
}
lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, **ds_params)
lgbfulltrain = lgb.Dataset(X_full_train, label = y_full_train, weight=w_full_train, **ds_params)
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val, **ds_params)
lgbsubmis = lgb.Dataset(X_sub, label = y_sub, weight=w_sub, **ds_params)

num_round = 1000
lgb_model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid],
                  valid_names = ['train', 'val'],
                  verbose_eval=5,
                  early_stopping_rounds=5,
                  feval = feval_lgbm,
#                   fobj = objective,
                 )

Training until validation scores don't improve for 5 rounds
[5]	train's l1: 24.8487	train's feval: 4.29048e+06	val's l1: 48.3218	val's feval: 271200
[10]	train's l1: 24.6151	train's feval: 4.61705e+06	val's l1: 47.8479	val's feval: 304486
[15]	train's l1: 24.2905	train's feval: 4.96764e+06	val's l1: 47.0789	val's feval: 331263
[20]	train's l1: 23.9994	train's feval: 5.34257e+06	val's l1: 46.4069	val's feval: 373530
[25]	train's l1: 23.2281	train's feval: 6.34816e+06	val's l1: 44.0037	val's feval: 507889
[30]	train's l1: 22.9181	train's feval: 6.77466e+06	val's l1: 43.133	val's feval: 560501
[35]	train's l1: 22.5438	train's feval: 7.29709e+06	val's l1: 42.0321	val's feval: 624908
[40]	train's l1: 22.4135	train's feval: 7.4532e+06	val's l1: 41.6251	val's feval: 647617
[45]	train's l1: 22.3849	train's feval: 7.46894e+06	val's l1: 41.6281	val's feval: 646772
Early stopping, best iteration is:
[40]	train's l1: 22.4135	train's feval: 7.4532e+06	val's l1: 41.6251	val's feval: 647617


In [80]:
prediction = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration).astype(int)
evaluate(prediction, y_sub, w_sub)

694935.078

In [81]:
# retrain!

In [82]:
lgb_model_sub = lgb.train(params,
                  lgbfulltrain,
                  lgb_model.best_iteration,
                  valid_sets = [lgbfulltrain],
                  valid_names = ['train'],
                  verbose_eval=5,
                  early_stopping_rounds=None,
                 feval = feval_lgbm,
#                   fobj = objective,
                 )

[5]	train's l1: 25.8364	train's feval: 4.56168e+06
[10]	train's l1: 25.5888	train's feval: 4.93825e+06
[15]	train's l1: 25.1633	train's feval: 5.49753e+06
[20]	train's l1: 24.6177	train's feval: 6.20721e+06
[25]	train's l1: 23.9284	train's feval: 7.13715e+06
[30]	train's l1: 23.5276	train's feval: 7.65153e+06
[35]	train's l1: 23.1251	train's feval: 8.29038e+06
[40]	train's l1: 22.9386	train's feval: 8.50625e+06


In [83]:
prediction = lgb_model_sub.predict(X_sub, num_iteration=80).astype(int)
evaluate(prediction, y_sub, w_sub)

739717.054

<hr>

## - CatBoost

In [84]:
from catboost import CatBoost, CatBoostRegressor, Pool

In [85]:
class feval_cat(object):
    def get_final_error(self, error, weight):
        # return error / (weight + 1e-38)
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, simulationPrice):
#         global smthing
#         smthing = [approxes, target, simulationPrice]
        prediction = np.array(approxes[0]).astype(int)
        target = np.array(target).astype(int)
        simulationPrice = np.array(simulationPrice)
        score = np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)
#         print('score', score)
#         print(approxes, type(target), type(simulationPrice))
        return score, 0

In [86]:
ds_params = {
#     'cat_features' : [8, 9, 10],
}
train_pool = Pool(X_train, label = y_train, weight = w_train, **ds_params)
trainfull_pool = Pool(X_full_train, label = y_full_train, weight = w_full_train, **ds_params)
val_pool = Pool(X_val, label = y_val, weight = w_val, **ds_params)
sub_pool = Pool(X_sub, label = y_sub, weight = w_sub, **ds_params)


model = CatBoostRegressor(
#     iterations = 2,
    depth=7, 
    learning_rate=0.1, 
    loss_function='MAE',
    early_stopping_rounds=5,
    eval_metric = feval_cat(),
    thread_count=-1,
)

model.fit(
    train_pool,
    eval_set=[train_pool, val_pool],
#     logging_level='Verbose',  # you can uncomment this for text output

);


0:	learn: 4553062.6628147	test: 4553062.6628147	test1: 254199.4882493	best: 254199.4882493 (0)	total: 66.8ms	remaining: 1m 6s
1:	learn: 5080532.9513628	test: 5080532.9513628	test1: 342110.4841304	best: 342110.4841304 (1)	total: 81.7ms	remaining: 40.7s
2:	learn: 5324451.3578394	test: 5324451.3578394	test1: 444157.7189374	best: 444157.7189374 (2)	total: 92.7ms	remaining: 30.8s
3:	learn: 5651841.3145368	test: 5651841.3145368	test1: 493705.1290571	best: 493705.1290571 (3)	total: 104ms	remaining: 25.9s
4:	learn: 5646189.5222374	test: 5646189.5222374	test1: 554045.4149854	best: 554045.4149854 (4)	total: 118ms	remaining: 23.4s
5:	learn: 5905600.7724218	test: 5905600.7724218	test1: 611343.0728264	best: 611343.0728264 (5)	total: 128ms	remaining: 21.3s
6:	learn: 6219821.4223866	test: 6219821.4223866	test1: 654614.1268941	best: 654614.1268941 (6)	total: 140ms	remaining: 19.8s
7:	learn: 6485456.9865360	test: 6485456.9865360	test1: 678919.5469258	best: 678919.5469258 (7)	total: 151ms	remaining: 18.

In [87]:
prediction = model.predict(X_sub, ntree_end = model.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

932333.0

In [88]:
# retrain!

In [89]:
model.best_iteration_

27

In [92]:
cat_sub = CatBoostRegressor(**{**model.get_params(), "iterations" : model.best_iteration_})
cat_sub.fit(
    trainfull_pool,
    eval_set=[trainfull_pool],
#     logging_level='Verbose',  # you can uncomment this for text output

);

0:	learn: 4877921.8366379	test: 4877921.8366379	best: 4877921.8366379 (0)	total: 15.1ms	remaining: 392ms
1:	learn: 5467868.6152376	test: 5467868.6152376	best: 5467868.6152376 (1)	total: 31.2ms	remaining: 390ms
2:	learn: 5820571.7668651	test: 5820571.7668651	best: 5820571.7668651 (2)	total: 46.7ms	remaining: 373ms
3:	learn: 6179459.8011581	test: 6179459.8011581	best: 6179459.8011581 (3)	total: 66.2ms	remaining: 381ms
4:	learn: 6227764.6548927	test: 6227764.6548927	best: 6227764.6548927 (4)	total: 82.1ms	remaining: 361ms
5:	learn: 6541577.2749894	test: 6541577.2749894	best: 6541577.2749894 (5)	total: 93.4ms	remaining: 327ms
6:	learn: 6889170.2589114	test: 6889170.2589114	best: 6889170.2589114 (6)	total: 105ms	remaining: 300ms
7:	learn: 7232641.9293438	test: 7232641.9293438	best: 7232641.9293438 (7)	total: 122ms	remaining: 289ms
8:	learn: 7167455.4202043	test: 7167455.4202043	best: 7232641.9293438 (7)	total: 135ms	remaining: 269ms
9:	learn: 7528826.7488443	test: 7528826.7488443	best: 7528

In [93]:
prediction = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

802618.406

<hr>

## - Ensemble

In [97]:
cat_w = .2
lgb_w = .2
xgb_w = 1
ensemble = model.predict(X_sub, ntree_end = model.best_iteration_) * cat_w
ensemble += lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration) * lgb_w
ensemble += bst.predict(dsub, ntree_limit=bst.best_ntree_limit) * xgb_w
ensemble = ensemble / (cat_w + lgb_w + xgb_w)
evaluate(ensemble.astype(int), y_sub, w_sub)

1066921.946

<hr>

## - Linear Regression

In [776]:
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import 

In [777]:
lr = LinearRegression()

In [778]:
lr.fit(X_train, y_train, w_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [780]:
print('train', evaluate(lr.predict(X_train), y_train, w_train))
print('test', evaluate(lr.predict(X_val), y_val, w_val))
print('sub', evaluate(lr.predict(X_sub), y_sub, w_sub))

train -25798195.590995364
test -2082818.3608716822
sub -2279245.550001612


<hr>

In [None]:
# fazer feature que pega o percentil de quando o item deu de dinheiro dentro de uma categoria
# fazer features que pega dist de atual até o pico mais alto
# dist do maior pico pro segundo
# min(dist(terceiro, primeiro), dist(terceiro, segundo))