In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
infos = pd.read_csv('../../data/infos.csv', sep = '|')

In [3]:
items = pd.read_csv('../../data/items.csv', sep = '|')

In [4]:
orders = pd.read_csv('../../data/orders.csv', sep = '|', parse_dates=['time'])

## - Creating the structure

In [5]:
df = orders.copy()

In [6]:
df['weekpair'] = (df.time.dt.dayofyear + 1) // 14 - 13

In [7]:
npairs = df.weekpair.nunique()

In [8]:
n_items = items['itemID'].nunique()
print('total number of items:', n_items)
print('expected number of instances:', n_items * npairs)

total number of items: 10463
expected number of instances: 136019


In [9]:
mi = pd.MultiIndex.from_product([range(-npairs, 0), items['itemID']], names=['weekpair', 'itemID'])
data_temp = pd.DataFrame(index = mi)

In [10]:
data_temp = data_temp.join(df.groupby(['weekpair', 'itemID'])[['order']].sum(), how = 'left')

In [11]:
data_temp.fillna(0, inplace = True)

In [12]:
data_temp.groupby('itemID').count().min()

order    13
dtype: int64

In [13]:
# data_temp

## - Creating features

In [134]:
items.head(2)

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,1,0,1,4.38,1,1,1,8.84
1,2,0,2,3.0,1,2,1,16.92


In [136]:
# features = [
#     ('itemID', 'item'),
#     ('manufacturer', 'manuf'),
#     ('category1', 'cat1'),
#     ('category2', 'cat2'),
#     ('category3', 'cat3')
# ]
# for f, n in features:
#     if f not in data.columns:
#         print('ops', f)

In [137]:
# features = [('itemID', 'item')]

In [138]:
# # f, name = ('manufacturer', 'manuf')
# for f, name in features:
#     print(f)
#     temp = data.groupby([f, 'weekpair'])[['order']].sum()
#     shifted = temp.groupby(f)[['order']].shift(1)
#     new_feature_block = pd.DataFrame()
#     for n in range(3):
#         rolled = shifted.groupby(f, as_index = False)['order'].rolling(2 ** n).mean()
#         new_feature_block['%s_%d' % (name, 2 ** n)] = rolled.reset_index(0, drop = True) # rolling has a weird index behavior...
#     data = pd.merge(data, new_feature_block.reset_index(), on = [f, 'weekpair'])

In [139]:
def gbagg(data, group_cols, targeted_cols, out_names, function, as_index = False):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = []
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval)
        new_feat.append(output)

        # go to next group
        i = j
    
    idx = data.groupby(group_cols).size().index # this is actually fast...
    out_df = pd.DataFrame(new_feat, columns = out_names, index = idx)
        
    if not as_index:
        out_df.reset_index(inplace = True)
        
    return out_df

In [140]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [141]:
def shift_and_2n_window(x, ws):
#     out = pd.DataFrame(x)
#     out = out.shift()
#     out = out.rolling(2 ** n).mean()

    shifted = np.zeros_like(x) # output
    shifted[1:] = x[:-1] # shift
    out = np.zeros_like(x, dtype = float)
    
    # rolling mean
    total = shifted[:ws].sum()
    out[ws - 1] = total / ws
    for i in range(ws, len(out)):
        total = total - shifted[i - ws] + shifted[i]
        out[i] = total / ws
    out[:ws] = np.NaN # maybe ws -1 should be NaN as well for receiving one NaN value when ws > 1
    # out[0] = np.NaN # this is always NaN for a shift of 1
    return out

In [196]:
data = data_temp.reset_index()
data = pd.merge(data, items, on = 'itemID')

In [197]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [198]:
# gbtransf(data, ['itemID', 'weekpair'], ['order'], ['out'], lambda x : np.ones_like(x))

In [199]:
shift_and_2n_window(np.array([1 , 2, 3, 4, 5, 6]), 2 ** 1)

array([nan, nan, 1.5, 2.5, 3.5, 4.5])

In [200]:
features = [('itemID', 'item')]

In [201]:
for f, name in features:
    print(f)
    new_feature_block = pd.DataFrame()
    for n in range(3):
        new_f = gbtransf(data, ['itemID'], ['order'], ['out'], shift_and_2n_window, {'ws' : 2 ** n})
        new_feature_block['%s_%d' % (name, 2 ** n)] = new_f['out']
#     data = pd.merge(data, new_feature_block.reset_index(), on = [f, 'weekpair'])
    data = pd.concat([data, new_feature_block], axis =  1)

itemID


In [202]:
data.count() # the larger the window, more NaN are expected

weekpair                  136019
itemID                    136019
order                     136019
brand                     136019
manufacturer              136019
customerRating            136019
category1                 136019
category2                 136019
category3                 136019
recommendedRetailPrice    136019
item_1                    125556
item_2                    115093
item_4                     94167
dtype: int64

In [203]:
def dist2firstvalue(x):
    out = np.zeros_like(x, dtype = float)
    first = np.NaN
    for i in range(len(x)):
        out[i] = first
        if x[i] != 0:
            first = i
            break
    if i == len(x) - 1:
        return out
    for j in range(int(first), len(x)):
        out[j] = j - first
        
    return out

In [204]:
dist2firstvalue(np.array([0 , 0, 0, 0])), dist2firstvalue(np.array([0 , 0, 3, 0, 5, 6]))

(array([nan, nan, nan, nan]), array([nan, nan,  0.,  1.,  2.,  3.]))

In [241]:
def dist2firstvalueLeak(x):
    out = np.zeros_like(x, dtype = float)
    for i in range(len(x)):
        if x[i] != 0:
            out[i] = 1
            break
#         else:
#             out[i] = -9999
        
    return out

# def dist2firstvalueLeakAdding(x):
#     out = np.zeros_like(x, dtype = float)
#     d = 1
#     for i in range(len(x)):
#         if x[i] != 0:
#             out[i] = d
#             break
#     d += 1
#     for j in range(i + 1, len(x)):
#         out[j] = d
#         d += 1
#     return out

In [242]:
dist2firstvalueLeak(np.array([0 , 0, 3, 0, 5, 6]))
# dist2firstvalue(np.array([0 , 0, 0, 0]))

array([0., 0., 1., 0., 0., 0.])

In [243]:
# def dist2lastpeak(x):
#     out = np.zeros_like(x, dtype = float)
#     peak = np.NaN
#     peak_val = 0
#     for i in range(0, len(x)):
#         out[i] = i - peak
#         if x[i] > peak_val:
#             peak = i
#             peak_val = x[i]
        
#     return out

In [244]:
# dist2lastpeak(np.array([0 , 0, 3, 0, 5, 6]))

In [245]:
data.sort_values(['itemID', 'weekpair'], inplace = True)

In [246]:
data['dist2firstvalueLeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2firstvalueLeak)['out']
# data['dist2lastpeak'] = gbtransf(data, ['itemID'], ['order'], ['out'], dist2lastpeak)['out']

In [247]:
data.groupby("weekpair")["dist2firstvalueLeak"].sum().to_dict()

{-13: 1431.0,
 -12: 729.0,
 -11: 371.0,
 -10: 533.0,
 -9: 785.0,
 -8: 909.0,
 -7: 716.0,
 -6: 661.0,
 -5: 785.0,
 -4: 671.0,
 -3: 794.0,
 -2: 727.0,
 -1: 728.0}

In [248]:
# del data["leak_cat3"]

In [249]:
the_cat = "manufacturer"

In [250]:
sla = data.groupby(["weekpair", the_cat])["dist2firstvalueLeak"].sum().reset_index()

In [251]:
sla = sla.rename(columns={"dist2firstvalueLeak" : "leak_cat3"})

In [252]:
# acho que mudou nada... opa, mudou sim
data = pd.merge(data, sla, on = ["weekpair", the_cat])

In [253]:
# the_cat = "brand"

In [254]:
# sla = data.groupby(["weekpair", the_cat])["dist2firstvalueLeak"].sum().reset_index()

In [255]:
# sla = sla.rename(columns={"dist2firstvalueLeak" : "leak_cat4"})

In [256]:
# del data["leak_cat4"]

In [257]:
# acho que mudou nada... opa, mudou sim
# data = pd.merge(data, sla, on = ["weekpair", the_cat])

In [258]:
data["total_new"] = data["weekpair"].map(data.groupby("weekpair")["dist2firstvalueLeak"].sum().to_dict())

In [259]:
data.fillna(0, inplace=True)

In [260]:
# checking if we got what we wanted
data.query('itemID == 1')

Unnamed: 0,weekpair,itemID,order,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,item_1,item_2,item_4,dist2firstvalueLeak,leak_cat3,total_new
0,-13,1,0.0,0,1,4.38,1,1,1,8.84,0.0,0.0,0.0,0.0,5.0,1431.0
40,-12,1,2.0,0,1,4.38,1,1,1,8.84,0.0,0.0,0.0,1.0,4.0,729.0
80,-11,1,313.0,0,1,4.38,1,1,1,8.84,2.0,1.0,0.0,0.0,6.0,371.0
120,-10,1,35.0,0,1,4.38,1,1,1,8.84,313.0,157.5,0.0,0.0,3.0,533.0
160,-9,1,3.0,0,1,4.38,1,1,1,8.84,35.0,174.0,87.5,0.0,0.0,785.0
200,-8,1,1.0,0,1,4.38,1,1,1,8.84,3.0,19.0,88.25,0.0,1.0,909.0
240,-7,1,1.0,0,1,4.38,1,1,1,8.84,1.0,2.0,88.0,0.0,0.0,716.0
280,-6,1,2.0,0,1,4.38,1,1,1,8.84,1.0,1.0,10.0,0.0,1.0,661.0
320,-5,1,299.0,0,1,4.38,1,1,1,8.84,2.0,1.5,1.75,0.0,12.0,785.0
360,-4,1,3.0,0,1,4.38,1,1,1,8.84,299.0,150.5,75.75,0.0,1.0,671.0


In [261]:
# data['weekswithtrans'] = data.groupby('itemID')['order'].apply(lambda x : (x > 0).cumsum()) / (data['weekpair'] + 14)

In [262]:
data.head()

Unnamed: 0,weekpair,itemID,order,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,item_1,item_2,item_4,dist2firstvalueLeak,leak_cat3,total_new
0,-13,1,0.0,0,1,4.38,1,1,1,8.84,0.0,0.0,0.0,0.0,5.0,1431.0
1,-13,31,0.0,0,1,5.0,1,1,1,18.87,0.0,0.0,0.0,0.0,5.0,1431.0
2,-13,32,1.0,0,1,4.6,1,1,1,26.49,0.0,0.0,0.0,1.0,5.0,1431.0
3,-13,37,1.0,0,1,5.0,1,1,1,17.1,0.0,0.0,0.0,1.0,5.0,1431.0
4,-13,108,0.0,0,1,5.0,1,1,1,13.35,0.0,0.0,0.0,0.0,5.0,1431.0


##  - Split Data

In [318]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [313]:
filtered_data = data
# filtered_data = data.query("dist2firstvalueLeak != 1")

In [314]:
len(data), len(filtered_data)

(136019, 126179)

In [315]:
# filtered_data.pop("itemID");

In [347]:
sub_week = -1
train = filtered_data.query('-13 <= weekpair <= (@sub_week - 2)').reset_index(drop = True)
full_train = filtered_data.query('-13 <= weekpair <= (@sub_week - 1)').reset_index(drop = True)
val = filtered_data.query('weekpair == (@sub_week - 1)').reset_index(drop = True)
sub = filtered_data.query('weekpair == (@sub_week)').reset_index(drop = True)

In [348]:
len(train), len(val), len(sub)

(106708, 9736, 9735)

In [349]:
y_train = train.pop('order').values
y_full_train = full_train.pop('order').values
y_val = val.pop('order').values
y_sub = sub.pop('order').values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

# train.pop("itemID")
# full_train.pop("itemID")
# val.pop("itemID")
# sub.pop("itemID")

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

<hr>

## - Min Expected Error

In [350]:
def evaluate(prediction, target, simulationPrice):
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [351]:
# max expected rmse
from sklearn.metrics import mean_squared_error as mse
# pred = data.loc[1:12].groupby('itemID')['order'].mean().sort_index()
# target_week = data.loc[13:, 'order'].reset_index(level = 0, drop = True).sort_index()
# mse(target_week, pred) ** .5

<hr>

## - XGBoost

In [352]:
import xgboost as xgb

In [353]:
xgb.__version__

'1.1.0'

In [354]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.astype(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
#     prediction.prediction(int)
#     prediction = np.minimum(prediction.astype(int), 1)
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [355]:
# custom feval

def feval(prediction, dtrain):
    prediction = prediction.astype(int)
#     predt = np.minimum(predt.astype(int), 1)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [356]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)
# specify parameters via map
param = {
    'max_depth':10,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 3,
    
#     'tree_method' : 'gpu_hist',
}
num_round = 400
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
#                 obj = objective,
                feval = feval,
                maximize = True,
                )

[0]	train-feval:43827.43995	val-feval:6173.05198
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:113892.57589	val-feval:13750.35994
[2]	train-feval:183502.97196	val-feval:18850.69190
[3]	train-feval:233343.52596	val-feval:24478.20785
[4]	train-feval:301891.86390	val-feval:31547.15586
[5]	train-feval:341252.15190	val-feval:33242.41784
[6]	train-feval:444336.66796	val-feval:42147.77382
[7]	train-feval:505619.70806	val-feval:47695.01181
[8]	train-feval:568699.63618	val-feval:49088.44376
[9]	train-feval:633998.81220	val-feval:58583.93575
[10]	train-feval:696674.41210	val-feval:57834.23171
[11]	train-feval:745174.00810	val-feval:59866.12766
[12]	train-feval:787895.56825	val-feval:60913.23170
[13]	train-feval:835154.59032	val-feval:62984.05768
[14]	train-feval:861769.50625	val-feval:58899.59970
[15]	train-feval:908923.27222	val-feval:62376.56573
[16]	train-feval:946315.95219	val-fev

In [357]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

101305.85199999996

In [358]:
# retrain!

In [359]:
bst_sub = xgb.train(param, dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    #                 obj = objective,
    feval = feval, maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

28

In [360]:
prediction = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

83873.15399999995

In [410]:
# some other things below

In [411]:
# max possible score
evaluate(y_sub, y_sub, w_sub)

7683293.24

In [412]:
# using previous weekpair
evaluate(y_val, y_sub, w_sub)

-3425880.0980000007

In [413]:
submission = items[['itemID']].copy()
submission['demandPrediction'] = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
submission.to_csv('../../submissions/sub_inclass_03.csv', sep = '|', index=False)
# submission.head()

<hr>

## - LGBM

In [643]:
def feval_lgbm(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice), True

In [799]:
data.columns

Index(['weekpair', 'itemID', 'order', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'item_1', 'item_2', 'item_4',
       'dist2firstvalueLeak', 'leak_cat3', 'total_new'],
      dtype='object')

In [796]:
list(data.columns).index("dist2firstvalueLeak")

13

In [802]:
import lightgbm as lgb

params = {
          "objective" : 'regression_l1',
#           "metric" :"rmse",
          "learning_rate" : 0.05,
          'verbosity': 2,
#           'max_depth': 6,
#           'num_leaves': 15,
          "min_data_in_leaf":1500
         }
# https://lightgbm.readthedocs.io/en/latest/Parameters.html


ds_params = {
#     'categorical_feature' : [3, 4, 5, 7, list(data.columns).index("dist2firstvalueLeak"),],
}
lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, **ds_params)
lgbfulltrain = lgb.Dataset(X_full_train, label = y_full_train, weight=w_full_train, **ds_params)
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val, **ds_params)
lgbsubmis = lgb.Dataset(X_sub, label = y_sub, weight=w_sub, **ds_params)

num_round = 1000
lgb_model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid],
                  valid_names = ['train', 'val'],
                  verbose_eval=5,
                  early_stopping_rounds=5,
                  feval = feval_lgbm,
#                   fobj = objective,
                 )

Training until validation scores don't improve for 5 rounds
[5]	train's l2: 1464.75	train's feval: -1.32057e+07	val's l2: 1858.36	val's feval: -1.01674e+06
[10]	train's l2: 1394.43	train's feval: -8.75271e+06	val's l2: 1778.31	val's feval: -708249
[15]	train's l2: 1348.92	train's feval: -6.43177e+06	val's l2: 1730.32	val's feval: -384746
[20]	train's l2: 1317.45	train's feval: -2.51634e+06	val's l2: 1699.24	val's feval: -260048
[25]	train's l2: 1296.63	train's feval: -1.63339e+06	val's l2: 1680.72	val's feval: -90904.3
[30]	train's l2: 1279.16	train's feval: -317393	val's l2: 1667.82	val's feval: -52188.8
[35]	train's l2: 1265.51	train's feval: 1.96098e+06	val's l2: 1658.66	val's feval: 54488.2
[40]	train's l2: 1256.37	train's feval: 2.95818e+06	val's l2: 1653.26	val's feval: 155709
[45]	train's l2: 1248.77	train's feval: 3.32183e+06	val's l2: 1650.56	val's feval: 201968
[50]	train's l2: 1242.02	train's feval: 3.28291e+06	val's l2: 1646.69	val's feval: 169352
Early stopping, best itera

In [803]:
prediction = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration).astype(int)
evaluate(prediction, y_sub, w_sub)

324378.17599999957

In [646]:
# retrain!

In [647]:
lgb_model_sub = lgb.train(params,
                  lgbfulltrain,
                  lgb_model.best_iteration,
                  valid_sets = [lgbfulltrain],
                  valid_names = ['train'],
                  verbose_eval=5,
                  early_stopping_rounds=None,
                 feval = feval_lgbm,
#                   fobj = objective,
                 )

[5]	train's l1: 4.73405	train's feval: 1.18954e+06
[10]	train's l1: 4.66981	train's feval: 2.33127e+06
[15]	train's l1: 4.62713	train's feval: 3.06989e+06
[20]	train's l1: 4.58673	train's feval: 3.73909e+06
[25]	train's l1: 4.56596	train's feval: 4.30633e+06
[30]	train's l1: 4.54644	train's feval: 4.59851e+06


In [648]:
prediction = lgb_model_sub.predict(X_sub, num_iteration=80).astype(int)
evaluate(prediction, y_sub, w_sub)

377472.03800000006

<hr>

## - CatBoost

In [117]:
from catboost import CatBoost, CatBoostRegressor, Pool

In [118]:
smthing =0

In [119]:
class feval_cat(object):
    def get_final_error(self, error, weight):
        # return error / (weight + 1e-38)
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, simulationPrice):
#         global smthing
#         smthing = [approxes, target, simulationPrice]
        prediction = np.array(approxes[0]).astype(int)
        target = np.array(target).astype(int)
        simulationPrice = np.array(simulationPrice)
        score = np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)
#         print('score', score)
#         print(approxes, type(target), type(simulationPrice))
        return score, 0

In [121]:
ds_params = {
#     'cat_features' : [8, 9, 10],
}
train_pool = Pool(X_train, label = y_train, weight = w_train, **ds_params)
trainfull_pool = Pool(X_full_train, label = y_full_train, weight = w_full_train, **ds_params)
val_pool = Pool(X_val, label = y_val, weight = w_val, **ds_params)
sub_pool = Pool(X_sub, label = y_sub, weight = w_sub, **ds_params)


model = CatBoostRegressor(
#     iterations = 2,
    depth=7, 
    learning_rate=0.1, 
    loss_function='MAE',
    early_stopping_rounds=5,
    eval_metric = feval_cat(),
    thread_count=-1,
)

model.fit(
    train_pool,
    eval_set=[train_pool, val_pool],
#     logging_level='Verbose',  # you can uncomment this for text output

);


0:	learn: 4448041.7662271	test: 4448041.7662271	test1: 305858.8646799	best: 305858.8646799 (0)	total: 64.9ms	remaining: 1m 4s
1:	learn: 4797594.0345867	test: 4797594.0345867	test1: 391382.2791724	best: 391382.2791724 (1)	total: 82.2ms	remaining: 41s
2:	learn: 5241880.2328963	test: 5241880.2328963	test1: 450040.9890731	best: 450040.9890731 (2)	total: 97.9ms	remaining: 32.5s
3:	learn: 5656339.7249157	test: 5656339.7249157	test1: 478154.2090763	best: 478154.2090763 (3)	total: 109ms	remaining: 27.1s
4:	learn: 5829988.9831609	test: 5829988.9831609	test1: 542399.8853838	best: 542399.8853838 (4)	total: 120ms	remaining: 23.8s
5:	learn: 6130978.2677783	test: 6130978.2677783	test1: 590502.4434476	best: 590502.4434476 (5)	total: 131ms	remaining: 21.6s
6:	learn: 6145374.0235386	test: 6145374.0235386	test1: 659497.9398484	best: 659497.9398484 (6)	total: 142ms	remaining: 20.1s
7:	learn: 6347476.0411946	test: 6347476.0411946	test1: 675588.8138203	best: 675588.8138203 (7)	total: 153ms	remaining: 19s
8

In [122]:
prediction = model.predict(X_sub, ntree_end = model.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

849506.584

In [123]:
# retrain!

In [124]:
model.best_iteration_

37

In [125]:
{**model.get_params(), "iterations" : model.best_iteration_}

{'learning_rate': 0.1,
 'depth': 7,
 'loss_function': 'MAE',
 'eval_metric': <__main__.feval_cat at 0x7f9be0177f90>,
 'early_stopping_rounds': 5,
 'iterations': 37}

In [126]:
cat_sub = CatBoostRegressor(**{**model.get_params(), "iterations" : model.best_iteration_})
cat_sub.fit(
    trainfull_pool,
    eval_set=[trainfull_pool],
#     logging_level='Verbose',  # you can uncomment this for text output

);

0:	learn: 4721346.8613232	test: 4721346.8613232	best: 4721346.8613232 (0)	total: 18.2ms	remaining: 654ms
1:	learn: 5151476.1550159	test: 5151476.1550159	best: 5151476.1550159 (1)	total: 39.7ms	remaining: 695ms
2:	learn: 5676616.5286998	test: 5676616.5286998	best: 5676616.5286998 (2)	total: 62.4ms	remaining: 707ms
3:	learn: 6182982.8467783	test: 6182982.8467783	best: 6182982.8467783 (3)	total: 80.7ms	remaining: 666ms
4:	learn: 6283733.3521879	test: 6283733.3521879	best: 6283733.3521879 (4)	total: 92.7ms	remaining: 593ms
5:	learn: 6710190.4723038	test: 6710190.4723038	best: 6710190.4723038 (5)	total: 104ms	remaining: 539ms
6:	learn: 6794799.7482211	test: 6794799.7482211	best: 6794799.7482211 (6)	total: 116ms	remaining: 496ms
7:	learn: 7010547.3314047	test: 7010547.3314047	best: 7010547.3314047 (7)	total: 127ms	remaining: 461ms
8:	learn: 7363507.9560617	test: 7363507.9560617	best: 7363507.9560617 (8)	total: 140ms	remaining: 434ms
9:	learn: 7548411.8307009	test: 7548411.8307009	best: 75484

In [127]:
prediction = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

754340.7719999999

<hr>

## - Ensemble

In [775]:
cat_w = 1
lgb_w = 1
xgb_w = 1
ensemble = model.predict(X_sub, ntree_end = model.best_iteration_) * cat_w
ensemble += lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration) * lgb_w
ensemble += bst.predict(dsub, ntree_limit=bst.best_ntree_limit) * xgb_w
ensemble = ensemble / (cat_w + lgb_w + xgb_w)
evaluate(ensemble.astype(int), y_sub, w_sub)

681241.5599999999

<hr>

## - Linear Regression

In [776]:
from sklearn.linear_model import LinearRegression
# from sklearn.metrics import 

In [777]:
lr = LinearRegression()

In [778]:
lr.fit(X_train, y_train, w_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [780]:
print('train', evaluate(lr.predict(X_train), y_train, w_train))
print('test', evaluate(lr.predict(X_val), y_val, w_val))
print('sub', evaluate(lr.predict(X_sub), y_sub, w_sub))

train -25798195.590995364
test -2082818.3608716822
sub -2279245.550001612


<hr>

In [None]:
# fazer feature que pega o percentil de quando o item deu de dinheiro dentro de uma categoria
# fazer features que pega dist de atual até o pico mais alto
# dist do maior pico pro segundo
# min(dist(terceiro, primeiro), dist(terceiro, segundo))