In [9]:
# Remote locations of the three input CSV files.
URL_calendar = "https://slavadatasets.s3.us-east-2.amazonaws.com/calendar.csv"
URL_sales_train = "https://slavadatasets.s3.us-east-2.amazonaws.com/sales_train_validation.csv"
URL_prices = "https://slavadatasets.s3.us-east-2.amazonaws.com/sell_prices.csv"

In [1]:
from  datetime import datetime, timedelta
import numpy as np, pandas as pd
import gc
import io
import dask.dataframe as dd
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Column dtypes used when reading the calendar file, grouped by target dtype.
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# Column dtypes used when reading the price file.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [3]:
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

In [4]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales table joined with calendar and price data.

    Reads the CSVs at the module-level ``URL_prices``, ``URL_calendar`` and
    ``URL_sales_train``, replaces every categorical column by int16 codes,
    melts the wide ``d_<n>`` sales columns into one row per (series, day) and
    inner-joins the calendar (on ``d``) and the prices (on ``store_id``,
    ``item_id``, ``wm_yr_wk``).

    The module-level names ``tr_last`` and ``max_lags`` must be defined when
    this function is called.

    Parameters
    ----------
    is_train : bool
        If True, sales columns ``d_<first_day>`` .. ``d_<tr_last>`` are loaded.
        If False, loading starts at ``max(tr_last - max_lags, first_day)`` and
        28 extra day columns filled with NaN are appended after ``tr_last``.
    nrows : int or None
        Keep only the first ``nrows`` rows (series) of the sales file.
        ``None`` keeps every row.
    first_day : int
        Lower bound for the first day column that is loaded.

    Returns
    -------
    pandas.DataFrame
        One row per (series, day) that has a matching calendar and price row.
    """

    def _encode_inplace(frame, columns):
        # Swap each categorical column for its int16 category codes, shifted so
        # that the smallest code present becomes 0 (a missing value is coded -1
        # by pandas and therefore ends up as 0).
        for col in columns:
            frame[col] = frame[col].cat.codes.astype("int16")
            frame[col] -= frame[col].min()

    prices = dd.read_csv(URL_prices,dtype = PRICE_DTYPES).compute()
    _encode_inplace(prices, [col for col, col_dtype in PRICE_DTYPES.items() if col_dtype == "category"])

    cal = dd.read_csv(URL_calendar,dtype = CAL_DTYPES).compute()
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_inplace(cal, [col for col, col_dtype in CAL_DTYPES.items() if col_dtype == "category"])

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    # `nrows` is deliberately NOT forwarded: dd.read_csv raises ValueError for
    # any non-empty `nrows`. The truncation is applied after encoding instead.
    dt = dd.read_csv(URL_sales_train,
                     usecols = catcols + numcols, dtype = dtype).compute()

    _encode_inplace(dt, [col for col in catcols if col != "id"])

    if nrows is not None:
        # Truncating only after the codes were computed on the whole file keeps
        # them identical to a full load, so `store_id` / `item_id` still line up
        # with the codes in `prices` for the merge below.
        dt = dt.head(nrows).copy()

    if not is_train:
        # Placeholder columns for the 28 days that follow the last training day.
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [5]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Columns written:

    * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 / 28 rows within each ``id``.
    * ``rmean_<lag>_<win>``: mean of ``lag_<lag>`` over a rolling window of
      ``win`` rows within each ``id``, for lag and win in {7, 28}.
    * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
      int16 when the column already exists, otherwise derived from ``date``.

    Shifts and rolling windows follow row order, so the rows of each ``id`` are
    assumed to already be in chronological order.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain ``id``, ``sales`` and a datetime ``date`` column.

    Returns
    -------
    None
        ``dt`` is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    # Feature name -> attribute of the ``.dt`` accessor that produces it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }

#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # ``.dt.weekofyear`` was removed in pandas 2.0; ``isocalendar().week``
            # yields the same ISO week number. Older pandas without
            # ``isocalendar`` falls through to the generic branch below.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [6]:
# h        : number of days in the forecast window (presumably; only referenced in commented-out code)
# max_lags : days of history kept before each predicted day when building features
# tr_last  : index of the last `d_<n>` column read from the sales file
h, max_lags, tr_last = 28, 57, 1913
# First date for which predictions are produced.
fday = datetime(year=2016, month=4, day=25)
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [7]:
# First day (the N of the `d_N` columns) loaded for training. Set it to 1 to load
# the whole history, at a high risk of running out of memory. Default: 350.
FIRST_DAY = 350

In [10]:
# Build the long-format training frame starting at day FIRST_DAY, then display its dimensions.
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

(40718219, 22)

In [11]:
# Add the lag, rolling-mean and calendar feature columns to df in place, then display the new dimensions.
create_fea(df)
df.shape

(40718219, 31)

In [12]:
# Drop, in place, every row that has a missing value in any column
# (e.g. rows whose lag / rolling features are undefined), then display the new dimensions.
df.dropna(inplace = True)
df.shape

(39041269, 31)

In [13]:
# Print the column dtypes and the memory footprint of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39041269 entries, 869062 to 40718218
Data columns (total 31 columns):
id              object
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
d               object
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
weekday         int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
lag_7           float32
lag_28          float32
rmean_7_7       float32
rmean_28_7      float32
rmean_7_28      float32
rmean_28_28     float32
week            int16
quarter         int16
mday            int16
dtypes: datetime64[ns](1), float32(11), int16(17), object(2)
memory usage: 4.0+ GB


In [14]:
# Columns handed to LightGBM as categorical features.
cat_feats = [
    # series identifiers
    "item_id", "dept_id", "store_id", "cat_id", "state_id",
    # calendar columns
    "wday", "month", "year",
    # events
    "event_name_1", "event_type_1", "event_name_2", "event_type_2",
    # SNAP flags
    "snap_CA", "snap_TX", "snap_WI",
    # date-derived columns
    "week", "quarter", "mday",
]

In [15]:
# Columns that are not used as model inputs (this includes the target, "sales").
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
# Every remaining column of df is a model feature; train_cols is reused at prediction time.
train_cols = df.columns[~df.columns.isin(useless_cols)]
X = df[train_cols]
# Target: the sales value of each row.
y = df["sales"]

In [16]:
# X and y now hold everything needed; drop the full frame and run a garbage collection
# (the displayed number is the count of unreachable objects gc.collect() found).
del df; gc.collect()

0

In [17]:
# Print the dtypes and the memory footprint of the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39041269 entries, 869062 to 40718218
Data columns (total 25 columns):
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
lag_7           float32
lag_28          float32
rmean_7_7       float32
rmean_28_7      float32
rmean_7_28      float32
rmean_28_28     float32
week            int16
quarter         int16
mday            int16
dtypes: float32(10), int16(15)
memory usage: 2.8 GB


In [18]:

# Seed NumPy's global RNG so that the validation sample below is reproducible.
np.random.seed(777)

# Sample 2,000,000 index labels of X, without replacement, to act as validation rows.
fake_valid_inds = np.random.choice(X.index.values, 2_000_000, replace = False)

In [19]:

# Training rows: every index label of X that was not sampled for validation.
train_inds = np.setdiff1d(X.index.values, fake_valid_inds)
train_data = lgb.Dataset(X.loc[train_inds] , label = y.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
# The validation set is a plain random sample of rows: no time-based
# train/validation split is applied here.
fake_valid_data = lgb.Dataset(X.loc[fake_valid_inds], label = y.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)

In [20]:
# Drop the names that are no longer used and run a garbage collection.
# NOTE(review): both Datasets were created with free_raw_data=False, so they may
# still hold the sliced data -- confirm how much memory this actually releases.
del X, y, fake_valid_inds, train_inds ; gc.collect()

20

In [21]:
# Display the validation Dataset object.
fake_valid_data

<lightgbm.basic.Dataset at 0x7fb9848bc790>

In [22]:
!pip install lofo-importance

Collecting lofo-importance
  Downloading https://files.pythonhosted.org/packages/17/b4/ffa18bca0914b8bcac215fc4ee52970e91c15019f55821cb937ca1e15343/lofo_importance-0.2.5-py2.py3-none-any.whl
Installing collected packages: lofo-importance
Successfully installed lofo-importance-0.2.5


In [23]:
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance



In [24]:
from tqdm import tqdm_notebook as tqdm

In [25]:
# LightGBM training parameters.
params = dict(
    objective="poisson",
    metric="rmse",
    force_row_wise=True,
    learning_rate=0.075,
    # sub_feature=0.8,
    sub_row=0.75,
    bagging_freq=1,
    lambda_l2=0.1,
    # nthread=4,
    verbosity=1,
    num_iterations=1200,
    num_leaves=128,
    min_data_in_leaf=100,
)

In [26]:
%%time

# Train the booster with `params` on train_data, evaluating on the validation
# sample and logging the metric every 20 iterations.
# NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0; newer
# versions need callbacks=[lgb.log_evaluation(20)] instead -- confirm the installed version.
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 2.91255
[40]	valid_0's rmse: 2.5962
[60]	valid_0's rmse: 2.51642
[80]	valid_0's rmse: 2.49522
[100]	valid_0's rmse: 2.48517
[120]	valid_0's rmse: 2.47822
[140]	valid_0's rmse: 2.46942
[160]	valid_0's rmse: 2.46102
[180]	valid_0's rmse: 2.45421
[200]	valid_0's rmse: 2.44791
[220]	valid_0's rmse: 2.44336
[240]	valid_0's rmse: 2.43961
[260]	valid_0's rmse: 2.4325
[280]	valid_0's rmse: 2.42546
[300]	valid_0's rmse: 2.42066
[320]	valid_0's rmse: 2.41789
[340]	valid_0's rmse: 2.41397
[360]	valid_0's rmse: 2.41073
[380]	valid_0's rmse: 2.4063
[400]	valid_0's rmse: 2.40233
[420]	valid_0's rmse: 2.39793
[440]	valid_0's rmse: 2.39557
[460]	valid_0's rmse: 2.39276
[480]	valid_0's rmse: 2.39032
[500]	valid_0's rmse: 2.38757
[520]	valid_0's rmse: 2.38642
[540]	valid_0's rmse: 2.38391
[560]	valid_0's rmse: 2.38225
[580]	valid_0's rmse: 2.37963
[600]	valid_0's rmse: 2.37681
[620]	valid_0's rmse: 2.375
[640]	valid_0's rmse: 2.37404
[660]	valid_0's rmse: 2.37187
[680]	valid_0's rms

In [27]:
%%time

alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)    

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.028 0.3333333333333333
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2

In [None]:
# Preview the first 10 rows of the combined submission.
sub.head(10)

In [None]:
# Sanity check: number of distinct ids, and how many ids end in "validation"
# (str.contains treats the pattern as a regular expression by default).
sub.id.nunique(), sub["id"].str.contains("validation$").sum()

In [None]:
# Dimensions of the combined submission.
sub.shape