In [1]:
import pandas as pd
import numpy as np
import utils

from sklearn.model_selection import GroupKFold, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import tweedie


from importlib import reload
reload(utils)
from skopt import gp_minimize

In [2]:
# Load the first training shard, parse the date column and use the calendar
# month of each row as its cross-validation fold id.
train = (
    pd.read_parquet("./train/0.parquet")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(fold=lambda frame: frame['date'].dt.month)
)

In [3]:
# Preview the first rows of the training frame (one row per sku and date).
train.head()

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,site_id,fold
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2


In [3]:
# Per-SKU lookup built from test_data.csv: the first CSV column becomes the
# index (used below as the sku key) and the remaining column is the value that
# later cells map onto `sku` as `stock`.
# Squeeze only the column axis so the result is always a Series: a bare
# `.squeeze()` would collapse a one-row file to a scalar, and `.map(test)` /
# `test.index` further down would then fail.
test = pd.read_csv("test_data.csv", index_col=0).squeeze("columns")

In [4]:
# Inspect column dtypes; the object-typed columns are the ones listed in `cats` below.
train.dtypes

sku                                int64
date                      datetime64[ns]
sold_quantity                      int64
current_price                    float64
currency                          object
listing_type                      object
shipping_logistic_type            object
shipping_payment                  object
minutes_active                   float64
item_domain_id                    object
site_id                           object
fold                               int64
dtype: object

In [5]:
# Preview the training frame again (same view as the earlier head() cell).
train.head()

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,site_id,fold
0,464801,2021-02-01,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
2,464801,2021-02-03,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
3,464801,2021-02-04,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2
4,464801,2021-02-05,1,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,MLB,2


In [6]:
# Categorical columns: ordinal-encoded below and appended to the model features.
cats = [
    'item_domain_id',
    'currency',
    'listing_type',
    'shipping_logistic_type',
    'shipping_payment',
    'site_id',
]

In [7]:
from category_encoders import OrdinalEncoder

# Replace every categorical column by an integer code.
# The column list has to be passed as `cols=`: the first positional parameter
# of OrdinalEncoder is `verbose`, so `OrdinalEncoder(cats)` silently set
# verbose=cats and only worked because cols=None auto-selects string columns.
enc = OrdinalEncoder(cols=cats)
train = enc.fit_transform(train)

In [8]:
def gen_tr_ts(data=None, folds=(2, 3)):
    """Yield one ``(train_positions, validation_positions, fold)`` triple per fold.

    For each fold id the training rows are those whose ``fold`` column equals
    the id. The validation rows are the rows of all *other* folds that fall on
    the latest date present among those other folds.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``fold`` and ``date`` columns. Defaults to the notebook-level
        ``train`` frame, which keeps the original zero-argument call working.
    folds : iterable, optional
        Fold ids to iterate over. Defaults to ``(2, 3)``.

    Yields
    ------
    (pandas.Index, pandas.Index, fold)
        Both indexes hold *positional* row numbers, because the consumer selects
        rows with ``.iloc``. For a frame with the default RangeIndex these are
        the same values as the index labels that were yielded before.
    """
    if data is None:
        data = train  # notebook-level training frame
    for fold in folds:
        in_fold = (data['fold'] == fold).to_numpy()
        # Latest date available outside the training fold.
        last_other_date = data.loc[~in_fold, 'date'].max()
        is_validation = ~in_fold & (data['date'] == last_other_date).to_numpy()
        yield (
            pd.Index(np.flatnonzero(in_fold)),
            pd.Index(np.flatnonzero(is_validation)),
            fold,
        )

    

In [None]:
def tune(params):
    """Objective for ``gp_minimize``: mean per-fold score from ``utils.rps``.

    Parameters
    ----------
    params : sequence
        Hyper-parameters, in the order of the search space:
        ``[learning_rate, max_depth, subsample, colsample_bytree,
        tweedie_variance_power, min_child_weight]``.

    Returns
    -------
    The average over the folds of the value returned by ``utils.rps``.

    Notes
    -----
    Reads the notebook-level ``train``, ``test``, ``cats`` and ``gen_tr_ts``.
    Prints the parameters and each fold's score as progress output.
    """
    print(params)
    features = ["current_price", "minutes_active"] + cats

    # Loop-invariant: the feature matrix and target do not depend on the fold.
    X = train[features]
    y = train['sold_quantity']

    mean_rps = 0.
    n_folds = 0
    for tr, ts, fold in gen_tr_ts():
        Xtr, ytr = X.iloc[tr], y.iloc[tr]
        Xval = X.iloc[ts]

        mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],
                           max_depth=params[1],
                           subsample=params[2],
                           colsample_bytree=params[3],
                           tweedie_variance_power=params[4],
                           min_child_weight=params[5],
                           random_state=0, objective="reg:tweedie",
                           base_score=1e-3,
                           tree_method='gpu_hist')
        mdl.fit(Xtr, ytr)
        p = mdl.predict(Xval)

        ## EVAL -- observed days until the cumulative sales reach the stock,
        ## computed on the rows that are NOT in the training fold.
        # .copy() so the column assignments below act on an independent frame
        # instead of a slice of `train`.
        pp = train.loc[train['fold'] != fold, ['sku', 'date', 'sold_quantity']].copy()
        pp['stock'] = pp['sku'].map(test)
        pp = pp.sort_values(["sku", "date"])
        pp['cumulative_y'] = pp.groupby("sku")['sold_quantity'].cumsum()

        pp = pp.dropna(subset=['stock'])  # keep only SKUs present in `test`
        pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']

        first_so_y = pp[pp['stockout_y']].groupby("sku").first()
        days_to_so_y = (first_so_y["date"] - pp["date"].min()) / np.timedelta64(1, 'D')
        # SKUs that never reach their stock get 30; the result is kept in 1..30.
        # NOTE(review): a stock-out on the first date gives 0 and is clipped to 1,
        # so the first and second day both map to 1 -- confirm this is intended.
        days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(30.).clip(1, 30)

        ## Predicted days to stock-out = stock / predicted quantity.
        ppp = train.iloc[ts][['sku']].copy()
        ppp['p'] = p
        ppp['stock'] = ppp['sku'].map(test)
        ppp = ppp.dropna(subset=['stock'])
        # Fill and clip BEFORE the integer cast (same order as the submission
        # cell): casting first fails on inf/NaN ratios, overflows on huge ones,
        # and would make the fillna a no-op.
        ppp['days_to_so'] = (ppp['stock'] / ppp['p']).fillna(30.).clip(1, 30).astype(int)
        # Select the column directly so the result is a Series even for one row.
        # NOTE(review): SKUs missing from the validation date become NaN here --
        # confirm utils.pred_list_to_tweedie tolerates that.
        days_to_so_p = ppp.set_index("sku")['days_to_so'].reindex(days_to_so_y.index)

        # Presumably turns each point estimate into a distribution over the
        # days (tweedie distribution -> [0.05, 0.07, ... .13, 0.12]); see utils.
        days_to_so_p2 = utils.pred_list_to_tweedie(days_to_so_p, phi=2, p=1.5)

        rps = utils.rps(days_to_so_y, days_to_so_p2, probs=True)
        mean_rps += rps
        n_folds += 1
        print(rps)
    # Average over the folds actually evaluated instead of a hard-coded 2.
    return mean_rps / max(n_folds, 1)

# Search space, one entry per position of `params` as read by tune().
space = [
    (1e-3, 1e-1, 'log-uniform'),  # params[0] -> learning_rate
    (1, 10),                      # params[1] -> max_depth
    (0.05, 0.95),                 # params[2] -> subsample
    (0.05, 0.95),                 # params[3] -> colsample_bytree
    (1.0, 1.99),                  # params[4] -> tweedie_variance_power
    (1, 300),                     # params[5] -> min_child_weight
]
# Minimise the mean CV score returned by tune() over that space.
res = gp_minimize(func=tune, dimensions=space, random_state=1, verbose=1)

    

Iteration No: 1 started. Evaluating function at random point.
[0.09871192514273254, 9, 0.16531200313642108, 0.9491364637917304, 1.2337280871824563, 120]
8.619976237016095
8.863340114023996
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 164.3399
Function value obtained: 8.7417
Current minimum: 8.7417
Iteration No: 2 started. Evaluating function at random point.
[0.0059678992438367785, 7, 0.8919851637254288, 0.8116798250174155, 1.3101407817629525, 158]
8.969453535454983
9.180772268605745
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 161.5522
Function value obtained: 9.0751
Current minimum: 8.7417
Iteration No: 3 started. Evaluating function at random point.
[0.007707362534461022, 3, 0.5309725180523154, 0.8725658221213098, 1.4526327599071185, 130]
9.148086951536671


In [None]:
# 15a 
# Mean CV 9.0805
# LB 6.2598


Iteration No: 2 started. Evaluating function at random point.
[0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]
6.131413939395725
6.4664243315180086
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 91.9157
Function value obtained: 6.2989
Current minimum: 6.2989

# Submission

In [10]:
# Feature rows used for the submission: the rows dated 2021-03-31 for every
# SKU that appears in `test`.
test_df = train[train['date'] == "2021-03-31"]
test_df = test_df[test_df['sku'].isin(test.index)]

# The predictions are written out in `test_df` row order, so that order must
# match the SKU order of `test`. Fail loudly instead of only printing the
# check, otherwise a misaligned submission would be produced silently.
is_aligned = np.array_equal(test_df['sku'].to_numpy(), test.index.to_numpy())
print(is_aligned)
assert is_aligned, "test_df rows are not in the same SKU order as `test`"

features = ["current_price", "minutes_active"] + cats
# Hyper-parameters chosen from the tuning run, in the order
# [learning_rate, max_depth, subsample, colsample_bytree,
#  tweedie_variance_power, min_child_weight].
params = [0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]
mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],
                   max_depth=params[1],
                   subsample=params[2],
                   colsample_bytree=params[3],
                   tweedie_variance_power=params[4],
                   min_child_weight=params[5],
                   random_state=0, objective="reg:tweedie",
                   base_score=1e-3,
                   tree_method='gpu_hist')
# Fit on every training row, then predict the quantity for the submission rows.
mdl.fit(train[features], train['sold_quantity'])
p = mdl.predict(test_df[features])

True


In [14]:
# One row per submission SKU: predicted quantity, mapped stock, and the
# resulting days-to-stock-out estimate (stock / prediction, NaN -> 30,
# limited to 1..30, truncated to an integer).
spp = (
    test_df[['sku']]
    .copy()
    .assign(p=p)
    .assign(stock=lambda frame: frame['sku'].map(test))
    .assign(
        days_to_so=lambda frame: (frame['stock'] / frame['p'])
        .fillna(30.)
        .clip(1, 30)
        .astype(int)
    )
)


In [15]:
# Fraction of the SKUs in `test` that have a row in `spp`; 1.0 means full coverage.
test.index.isin(spp['sku']).mean()

1.0

In [16]:
pd.set_option("display.max_columns", 31)

# Expand each days-to-stock-out estimate into one row of values via
# utils.pred_list_to_tweedie, round to 4 decimals and write the result as a
# header-less, index-less gzip CSV.
prob_array = utils.pred_list_to_tweedie(spp['days_to_so'].values, phi=2., p=1.5)
rounded_probs = pd.DataFrame(prob_array).round(4)
rounded_probs.to_csv("19.csv.gz", header=False, index=False, compression="gzip")

In [18]:
# Read the written submission back as a sanity check.
pd.read_csv("19.csv.gz",header=None)  # append .sum(axis=1) to inspect the per-row totals

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,0.0215,0.0145,0.0175,0.0204,0.0233,0.0259,0.0284,0.0307,0.0328,0.0346,0.0362,0.0375,0.0386,0.0395,0.0401,0.0405,0.0407,0.0407,0.0405,0.0402,0.0397,0.0390,0.0383,0.0374,0.0364,0.0354,0.0343,0.0331,0.0319,0.0307
1,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
2,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
3,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
4,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551467,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
551468,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
551469,0.0161,0.0111,0.0137,0.0162,0.0188,0.0213,0.0238,0.0261,0.0284,0.0305,0.0325,0.0343,0.0359,0.0373,0.0386,0.0397,0.0406,0.0413,0.0418,0.0422,0.0424,0.0424,0.0423,0.0421,0.0417,0.0412,0.0406,0.0399,0.0391,0.0383
551470,0.0399,0.0256,0.0296,0.0331,0.0361,0.0386,0.0405,0.0419,0.0429,0.0434,0.0435,0.0432,0.0427,0.0418,0.0407,0.0394,0.0379,0.0364,0.0347,0.0330,0.0312,0.0294,0.0276,0.0259,0.0242,0.0225,0.0209,0.0193,0.0178,0.0164
