In [2]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/0b/9d/ddcb2f43aca194987f1a99e27edf41cf9bc39ea750c3371c2a62698c509a/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 15.1MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1
[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [2]:
# dtypes handed to pd.read_csv for calendar.csv (key order is preserved).
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# dtypes handed to pd.read_csv for sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [3]:
# Show up to 50 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 50)

In [4]:
# h: presumably the forecast horizon in days (not read by the active code shown here).
# max_lags: days of history kept before a forecast day for lag features.
# tr_last: index of the last d_<n> sales column that is loaded.
h, max_lags, tr_last = 28, 70, 1913

# First date that gets predicted.
fday = datetime(year=2016, month=4, day=25)
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [16]:
import boto3
import os

# S3 client plus the bucket / key prefix the CSV files are read from.
client = boto3.client("s3")
BUCKET = "dtci-dataplatform-telemetry-datsci-dev-bucket"
S3_PATH = "Jason/m5"


In [17]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales table joined with calendar and price data.

    Reads ``sell_prices.csv``, ``calendar.csv`` and ``sales_train_validation.csv``
    from ``S3_PATH`` in ``BUCKET`` through the module-level ``client``.

    Parameters
    ----------
    is_train : bool
        If True, day columns are loaded starting at ``max(1, first_day)``.
        If False, loading starts at ``max(tr_last - max_lags, first_day)`` and
        28 all-NaN day columns are appended after ``tr_last`` to hold predictions.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file.
    first_day : int
        Lower bound on the first ``d_<n>`` column that is loaded.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) that has a matching calendar and price record,
        with a ``sales`` column. Categorical columns other than ``id`` are
        replaced by int16 codes starting at 0.
    """
    def read_csv_from_s3(filename, **read_kwargs):
        # S3 object keys are always "/"-separated; os.path.join would use the
        # local OS separator and build a wrong key on Windows.
        key = f"{S3_PATH.rstrip('/')}/{filename}"
        obj = client.get_object(Bucket = BUCKET, Key = key)
        return pd.read_csv(obj['Body'], **read_kwargs)

    def encode_categories(frame, columns):
        # Swap each categorical column for its int16 codes, shifted so the minimum is 0.
        for col in columns:
            frame[col] = frame[col].cat.codes.astype("int16")
            frame[col] -= frame[col].min()

    prices = read_csv_from_s3('sell_prices.csv', dtype = PRICE_DTYPES)
    encode_categories(prices, [col for col, kind in PRICE_DTYPES.items() if kind == "category"])

    cal = read_csv_from_s3('calendar.csv', dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    encode_categories(cal, [col for col, kind in CAL_DTYPES.items() if kind == "category"])

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})

    dt = read_csv_from_s3('sales_train_validation.csv',
                          nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    # NOTE(review): store_id/item_id codes here and in `prices` are derived from
    # two different files; they presumably only line up when both contain the
    # same set of values -- confirm before relying on `nrows`.
    encode_categories(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Empty future days that the forecasting loop fills in.
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [18]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Expects ``dt`` to have ``id``, ``sales`` and a datetime ``date`` column,
    with rows of each ``id`` already in chronological order.

    Columns written:
      * ``lag_7``, ``lag_28``: ``sales`` shifted within each ``id``.
      * ``rmean_<lag>_<win>`` for lag and win in (7, 28): rolling mean of the
        lag column over ``win`` rows within each ``id``.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday`` as int16;
        a column that already exists is only cast, otherwise it is derived
        from ``date``.

    Returns None; the frame is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt.groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt.groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
            # yields the same ISO week number.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [19]:
FIRST_DAY = 350 # Set to 1 to load the whole history -- high risk of running out of memory!

In [20]:
%%time

df = create_dt(is_train=True)
df.shape

KeyboardInterrupt: 

### Load Dataset

In [47]:
# Lower bound on the first d_<n> sales column loaded by the step-by-step cells below.
first_day = 1200

In [32]:
filename = 'sell_prices.csv'
# S3 object keys are always "/"-separated, so build the key explicitly instead of
# with os.path.join, which uses the local OS separator.
obj = client.get_object(Bucket = BUCKET, Key = f"{S3_PATH.rstrip('/')}/{filename}")
prices =  pd.read_csv(obj['Body'], dtype = PRICE_DTYPES)

In [33]:
# Swap each categorical price column for int16 codes shifted so the minimum is 0.
for col in [name for name, kind in PRICE_DTYPES.items() if kind == "category"]:
    codes = prices[col].cat.codes.astype("int16")
    prices[col] = codes - codes.min()

In [37]:
filename = 'calendar.csv'
# S3 object keys are always "/"-separated, so build the key explicitly instead of
# with os.path.join, which uses the local OS separator.
obj = client.get_object(Bucket = BUCKET, Key = f"{S3_PATH.rstrip('/')}/{filename}")
cal =  pd.read_csv(obj['Body'], dtype = CAL_DTYPES)

In [71]:
# Peek at the last rows of the calendar table.
cal.tail()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1964,2016-06-15,11620,6,5,6,2016,d_1965,0,0,0,0,0.0,1.0,1.0
1965,2016-06-16,11620,4,6,6,2016,d_1966,0,0,0,0,0.0,0.0,0.0
1966,2016-06-17,11620,0,7,6,2016,d_1967,0,0,0,0,0.0,0.0,0.0
1967,2016-06-18,11621,2,1,6,2016,d_1968,0,0,0,0,0.0,0.0,0.0
1968,2016-06-19,11621,3,2,6,2016,d_1969,17,4,3,1,0.0,0.0,0.0


In [41]:
# Parse the date column, then swap each categorical calendar column for
# int16 codes shifted so the minimum is 0.
cal["date"] = pd.to_datetime(cal["date"])
for col in [name for name, kind in CAL_DTYPES.items() if kind == "category"]:
    codes = cal[col].cat.codes.astype("int16")
    cal[col] = codes - codes.min()

In [44]:
# Display the last training day index for reference.
tr_last

1913

In [100]:
# Settings for the step-by-step load below: training mode, no row limit.
is_train, nrows = True, None

In [101]:
# Training may start at day 1; otherwise start no earlier than tr_last - max_lags.
# Either way first_day acts as a lower bound.
earliest_day = 1 if is_train else tr_last - max_lags
start_day = max(earliest_day, first_day)

catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
numcols = ["d_%d" % day for day in range(start_day, tr_last + 1)]

# Day columns are read as float32; every id-like column except "id" as category.
dtype = dict.fromkeys(numcols, "float32")
dtype.update(dict.fromkeys([col for col in catcols if col != "id"], "category"))

In [102]:
filename = 'sales_train_validation.csv'
# S3 object keys are always "/"-separated, so build the key explicitly instead of
# with os.path.join, which uses the local OS separator.
obj = client.get_object(Bucket = BUCKET, Key = f"{S3_PATH.rstrip('/')}/{filename}")
dt = pd.read_csv(obj['Body'], nrows = nrows, usecols = catcols + numcols, dtype = dtype)

In [103]:
# Swap every categorical column except "id" for int16 codes shifted so the minimum is 0.
for col in (name for name in catcols if name != "id"):
    codes = dt[col].cat.codes.astype("int16")
    dt[col] = codes - codes.min()

In [104]:
# For inference, append 28 all-NaN day columns after tr_last.
if not is_train:
    future_days = range(tr_last + 1, tr_last + 28 + 1)
    for day in future_days:
        dt[f"d_{day}"] = np.nan

In [105]:
# Reshape wide day columns into one (id, d, sales) row per day.
day_cols = [col for col in dt.columns if col.startswith("d_")]
dt = dt.melt(id_vars=catcols, value_vars=day_cols, var_name="d", value_name="sales")

In [106]:
# Attach calendar columns by day label, then prices by store / item / week.
price_keys = ["store_id", "item_id", "wm_yr_wk"]
dt = dt.merge(cal, on="d", copy=False).merge(prices, on=price_keys, copy=False)

In [107]:
# Lag distances (in rows per id) and the names of the columns that will hold them.
lags = [7, 28]
lag_cols = ["lag_" + str(lag) for lag in lags]

In [108]:
# Display the generated lag column names.
lag_cols

['lag_7', 'lag_28']

In [109]:
# Add one lagged-sales column per lag, shifting within each id.
lags = [7, 28]
lag_cols = []
for lag in lags:
    lag_col = f"lag_{lag}"
    lag_cols.append(lag_col)
    dt[lag_col] = dt.groupby("id")["sales"].shift(lag)

In [110]:
# Rolling mean of every lag column over each window size, computed per id.
wins = [7, 28]
for win in wins:
    for lag, lag_col in zip(lags, lag_cols):
        per_id = dt.groupby("id")[lag_col]
        dt[f"rmean_{lag}_{win}"] = per_id.transform(lambda x: x.rolling(win).mean())

In [111]:
# Output column name -> name of the `Series.dt` attribute it is derived from.
date_features = dict(
    wday="weekday",
    week="weekofyear",
    month="month",
    quarter="quarter",
    year="year",
    mday="day",
)

In [112]:
# Cast calendar columns that already exist to int16; derive the rest from `date`.
for date_feat_name, date_feat_func in date_features.items():
    if date_feat_name in dt.columns:
        dt[date_feat_name] = dt[date_feat_name].astype("int16")
    elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
        # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
        # yields the same ISO week number.
        dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
    else:
        dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [148]:
# Alias only: df and dt now refer to the same DataFrame object (no copy is made).
df = dt

In [149]:
# Drop every row containing a missing value, modifying the frame in place.
df.dropna(inplace = True)
df.shape

(19613851, 31)

In [150]:
# Columns LightGBM should treat as categorical.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Identifier, target and raw-label columns that are excluded from the features.
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]

is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
X_train = df[train_cols]
y_train = df["sales"]

In [151]:
# Fixed seed so the 2,000,000-row validation sample is reproducible.
np.random.seed(777)

# Draw validation index labels uniformly at random, without replacement.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)

In [152]:
# Inspect the sampled validation index labels.
fake_valid_inds

array([18145075, 10334553, 15161275, ..., 15698270,  9414590, 16811310])

In [154]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

CPU times: user 15.7 s, sys: 1.92 s, total: 17.7 s
Wall time: 12 s


In [155]:
# df is only an alias of dt, so dt has to be deleted as well for the frame to be released.
del df, dt, X_train, y_train, fake_valid_inds, train_inds
gc.collect()

0

In [36]:
# LightGBM training parameters.
# The original literal listed "metric" twice; only the later value (["rmse"])
# took effect, so that single entry is kept.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    "verbosity": 1,
    "num_iterations": 2000,
    "num_leaves": 128,
    "min_data_in_leaf": 50,
}

In [None]:
%%time

# Train on train_data, reporting the metric on fake_valid_data every 100 iterations.
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=100) 



[100]	valid_0's rmse: 2.46142
[200]	valid_0's rmse: 2.4304
[300]	valid_0's rmse: 2.40185
[400]	valid_0's rmse: 2.38303
[500]	valid_0's rmse: 2.36535
[600]	valid_0's rmse: 2.35597
[700]	valid_0's rmse: 2.34579
[800]	valid_0's rmse: 2.33803
[900]	valid_0's rmse: 2.3319
[1000]	valid_0's rmse: 2.32648
[1100]	valid_0's rmse: 2.32117
[1200]	valid_0's rmse: 2.31492
[1300]	valid_0's rmse: 2.31062
[1400]	valid_0's rmse: 2.30686
[1500]	valid_0's rmse: 2.30331


In [None]:
# Write the trained booster to the relative path "model.lgb".
m_lgb.save_model("model.lgb")

In [None]:
%%time

alphas = [1.035, 1.03, 1.025]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(icount, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev



    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

0 2016-04-25 00:00:00
0 2016-04-26 00:00:00
0 2016-04-27 00:00:00
0 2016-04-28 00:00:00
0 2016-04-29 00:00:00
0 2016-04-30 00:00:00
0 2016-05-01 00:00:00
0 2016-05-02 00:00:00
0 2016-05-03 00:00:00
0 2016-05-04 00:00:00
