In [2]:
import pandas as pd
import numpy as np
import os
from m5_helpers.metrics import WRMSSEEvaluator
from m5_helpers.model_selection import M5TimeSeriesSplit

from IPython.display import display, HTML

%load_ext autoreload
%autoreload 2

In [4]:
import itertools

In [5]:
from sklearn.linear_model import Lasso, Ridge

In [3]:
calendar = pd.read_csv("data/calendar.csv")
prices = pd.read_csv("data/sell_prices.csv")
train = pd.read_csv("data/sales_train_validation.csv")
submission = pd.read_csv("data/sample_submission.csv")
evaluators = dict()

In [13]:
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

In [6]:

def get_param_grid(params_values, size=None):

    param_grid = []
    grid_values = list(itertools.product(*params_values.values()))
    if size is not None:
        indices = np.random.choice(len(grid_values), size, replace=False)
        grid_values = [grid_values[i] for i in indices]

    for values in grid_values:
        param_grid.append(dict(zip(params_values.keys(), values)))

    return param_grid

params_values = {
    "alpha": [100,10,1]
}


In [53]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))


reduce_mem_usage(train)

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [161]:
temp  = [(100, 5), (100, 50), (10, 5), (10, 50), (1, 5)]
indices = np.random.choice(len(temp), 2, replace=False)
[temp[i] for i in indices]

[(100, 5), (10, 50)]

[{'alpha': 10}, {'alpha': 100}, {'alpha': 1}]

array([1, 1, 1, ..., 0, 2, 0], dtype=int16)

In [60]:
from lightgbm import LGBMRegressor

In [80]:
slides = [0, 4, 8, 12, 16, 20, 24, 28]
params_values = {
#     "alpha": [1, 5, 10, 20]
    "colsample_bytree": [0.6,0.8,1]
}
scores = []
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in slides:
        print(s, end= "\t")
        train_sub = train.iloc[:,:-(s+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
#             reg = Lasso(alpha=params["alpha"])
            reg = LGBMRegressor(**params)
            reg.fit(X_tr.drop(columns=id_columns).values, y_tr.values[:,0])
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

            print(i, end= " ")

        if s not in evaluators:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": s,
                       "score": score,
                       "params": params})

{'colsample_bytree': 0.6}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.120337888402874
4	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.263865439091137
8	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.035424247676447
12	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0.9934696981105366
16	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.0432723771010493
20	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.6591786580200647
24	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.0264113503876917
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.8409579530876357
{'colsample_bytree': 0.8}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.1394881264640098
4	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.2608524472939981
8	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.0328921683542698
12	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.9803521148382653
16	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.050875895131707
20	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.6587903570872693
24	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.0243877014400566
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.8415749927408794
{'colsample_bytree': 1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 

In [81]:
pd.DataFrame(scores)

Unnamed: 0,slide,score,params
0,0,1.120338,{'colsample_bytree': 0.6}
1,4,1.263865,{'colsample_bytree': 0.6}
2,8,1.035424,{'colsample_bytree': 0.6}
3,12,0.99347,{'colsample_bytree': 0.6}
4,16,1.043272,{'colsample_bytree': 0.6}
5,20,1.659179,{'colsample_bytree': 0.6}
6,24,1.026411,{'colsample_bytree': 0.6}
7,28,0.840958,{'colsample_bytree': 0.6}
8,0,1.139488,{'colsample_bytree': 0.8}
9,4,1.260852,{'colsample_bytree': 0.8}


In [63]:
pd.DataFrame(scores)

Unnamed: 0,slide,score,params
0,0,1.120338,{'colsample_bytree': 0.6}
1,5,1.121635,{'colsample_bytree': 0.6}
2,10,1.158968,{'colsample_bytree': 0.6}
3,28,0.840958,{'colsample_bytree': 0.6}
4,0,1.139488,{'colsample_bytree': 0.8}
5,5,1.115,{'colsample_bytree': 0.8}
6,10,1.152422,{'colsample_bytree': 0.8}
7,28,0.841575,{'colsample_bytree': 0.8}
8,0,1.14072,{'colsample_bytree': 1}
9,5,1.109559,{'colsample_bytree': 1}


In [26]:
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline

In [30]:
reg.named_steps["scale"].transform(X_te.drop(columns=id_columns).values)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.03212463,
        0.12849851],
       [0.        , 0.        , 0.        , ..., 0.03686049, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03136122, 0.07840304, 0.01568061, ..., 0.        , 0.        ,
        0.03136122],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01417905,
        0.0567162 ]])

In [36]:
# temp = reg.named_steps["las"]
np.sum(temp.coef_)

0.0

In [51]:
slides = [0, 28]
params_values = {
    "alpha": [1, 5, 10, 20]
}
scores = []
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in slides:
        print(s, end= "\t")
        train_sub = train.iloc[:,:-(s+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
            reg = Pipeline([
#                 ('scale', Normalizer()),
                ('las', Lasso(alpha=params["alpha"], normalize=False))])

            reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

            print(i, end= " ")

        if s not in evaluators:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": s,
                       "score": score,
                       "params": params})

{'alpha': 1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.8952459016596177
28	0 1 2 3 4 5 6 7 8 

KeyboardInterrupt: 

In [49]:
slides = [0, 28]
params_values = {
    "ridge_alpha": [0.1, 1, 5, 10, 20]
}
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in slides:
        print(s, end= "\t")
        train_sub = train.iloc[:,:-(s+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
            reg = Pipeline([
                ('las', Ridge(alpha=params["ridge_alpha"], normalize=True))])
            reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)[:,0]

            print(i, end= " ")

        if s not in evaluators:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": s,
                       "score": score,
                       "params": params})

{'ridge_alpha': 0.1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.642956187863623
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 4.237438293187833
{'ridge_alpha': 1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.3135852343832015
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 7.198577039882706
{'ridge_alpha': 5}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 3.8695674453697997
28	0 1 2 3 4 

KeyboardInterrupt: 

In [10]:
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=train.columns[6:].tolist(),
                       fixed_columns=train.columns[:6].tolist(),
                       return_index=True,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=True,
                       do_enumerate=True)

predictions = pd.DataFrame()
for i, X_tr, y_tr, X_te, y_te in ts.split(train):
    print(X_te[-5:])
    print(y_te)
    if i ==5:
        break

['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1886']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1887']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1888']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1889']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1890']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1891']


In [64]:
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=train.columns[6:].tolist() + submission.columns[1:].tolist(),
                       fixed_columns=train.columns[:6].tolist(),
                       return_index=True,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=True,
                       do_enumerate=True)

predictions = pd.DataFrame()
params = {'colsample_bytree': 0.6}
for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(train):
    print(i)
    X_tr, y_tr, X_te = train[X_tr_col], train[y_tr_col], train[X_te_col]
    
    reg = LGBMRegressor(**params)
    reg.fit(X_tr.drop(columns=id_columns).values, y_tr)

    col_name = submission.columns[i+1]
    predictions[col_name] = reg.predict(X_te.drop(columns=id_columns).values)

0


  y = column_or_1d(y, warn=True)


1


  y = column_or_1d(y, warn=True)


2


  y = column_or_1d(y, warn=True)


3


  y = column_or_1d(y, warn=True)


4


  y = column_or_1d(y, warn=True)


5


  y = column_or_1d(y, warn=True)


6


  y = column_or_1d(y, warn=True)


7


  y = column_or_1d(y, warn=True)


8


  y = column_or_1d(y, warn=True)


9


  y = column_or_1d(y, warn=True)


10


  y = column_or_1d(y, warn=True)


11


  y = column_or_1d(y, warn=True)


12


  y = column_or_1d(y, warn=True)


13


  y = column_or_1d(y, warn=True)


14


  y = column_or_1d(y, warn=True)


15


  y = column_or_1d(y, warn=True)


16


  y = column_or_1d(y, warn=True)


17


  y = column_or_1d(y, warn=True)


18


  y = column_or_1d(y, warn=True)


19


  y = column_or_1d(y, warn=True)


20


  y = column_or_1d(y, warn=True)


21


  y = column_or_1d(y, warn=True)


22


  y = column_or_1d(y, warn=True)


23


  y = column_or_1d(y, warn=True)


24


  y = column_or_1d(y, warn=True)


25


  y = column_or_1d(y, warn=True)


26


  y = column_or_1d(y, warn=True)


27


  y = column_or_1d(y, warn=True)


In [68]:
submission.iloc[:predictions.shape[0],1:] = predictions.values

In [72]:
submission.to_csv("submissions/submission_4.csv", index=False)