In [1]:
import pandas as pd
import numpy as np
import os
from m5_helpers.metrics import WRMSSEEvaluator
from m5_helpers.model_selection import M5TimeSeriesSplit

from IPython.display import display, HTML

%load_ext autoreload
%autoreload 2

In [58]:
calendar = pd.read_csv("data/calendar.csv")
prices = pd.read_csv("data/sell_prices.csv")
train = pd.read_csv("data/sales_train_validation.csv")
submission = pd.read_csv("data/sample_submission.csv")

In [3]:
id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

In [32]:
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=train.columns[6:].tolist(),
                       fixed_columns=train.columns[:6].tolist(),
                       return_index=False,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=True,
                       do_enumerate=True)

In [33]:
from sklearn.linear_model import Lasso, Ridge

array([[ 0.79152245],
       [-0.25371545],
       [ 2.26235312],
       ...,
       [-1.50075844],
       [ 2.96439339],
       [ 8.90082422]])

In [41]:
predictions = pd.DataFrame()
for i, X_tr, y_tr, X_te, y_te in ts.split(train):
    
    reg = Ridge()
    reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
    predictions[train.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)[:,0]

    print(i)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


In [49]:
e = WRMSSEEvaluator(train.iloc[:,:-28], train.iloc[:,-28:], calendar, prices)
predictions[predictions<0] = 0
e.score(predictions)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))




6.7606507970081875

In [59]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))


reduce_mem_usage(train)

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [70]:
scores

[{'slide': 0, 'score': 0.8952459016596177},
 {'slide': 1, 'score': 1.616635784541945}]

In [66]:
slides = [0, 5, 10]
# evaluators = [None for i in range(len(slides))]
# scores = []
for s in range(slides):
    train_sub = train.iloc[:,:-(slides[s]+1)]

    ts = M5TimeSeriesSplit(n_days=28, 
                           days_columns=train_sub.columns[6:].tolist(),
                           fixed_columns=train_sub.columns[:6].tolist(),
                           return_index=False,
                           rename=False,
                           sliding_window=True,
                           method=1,
                           split_train=True,
                           do_enumerate=True)


    predictions = pd.DataFrame()
    for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
        reg = Lasso()
        reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
        predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

        print(i)
        
    if evaluators[s] is None:
        evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)
        
    score = evaluators[s].score(predictions)
    print(score)
    scores.append({"slide": s,
                   "score": score})

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0.8952459016596177
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


IndexError: list index out of range

In [161]:
temp  = [(100, 5), (100, 50), (10, 5), (10, 50), (1, 5)]
indices = np.random.choice(len(temp), 2, replace=False)
[temp[i] for i in indices]

[(100, 5), (10, 50)]

In [190]:
import itertools

def get_param_grid(params_values, size=None):

    param_grid = []
    grid_values = list(itertools.product(*params_values.values()))
    if size is not None:
        indices = np.random.choice(len(grid_values), size, replace=False)
        grid_values = [grid_values[i] for i in indices]

    for values in grid_values:
        param_grid.append(dict(zip(params_values.keys(), values)))

    return param_grid

params_values = {
    "alpha": [100,10,1]
}
get_param_grid(params_values, 3)

[{'alpha': 10}, {'alpha': 100}, {'alpha': 1}]

In [208]:
train_sub

2

In [221]:
slides = [0, 5, 10, 28]
evaluators = [None for i in range(len(slides))]
params_values = {
    "alpha": [1, 5, 10, 20]
}
scores = []
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in range(len(slides)):
        print(slides[s], end= "\t")
        train_sub = train.iloc[:,:-(slides[s]+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
            reg = Lasso(alpha=params["alpha"])
            reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

            print(i, end= " ")

        if evaluators[s] is None:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": slides[s],
                       "score": score,
                       "params": params})

{'alpha': 1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0.8952459016596177
5	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.616635784541945
10	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


1.6737230384432675
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0.9039357819365291
{'alpha': 5}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.330967242252422
5	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.0437145208045777
10	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.0546468876085546
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.4540199311832558
{'alpha': 10}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.073407504038556
5	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.346290378675389
10	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.2936557393392363
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.098083756241625
{'alpha': 20}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.8206900054954134
5	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.366304

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
slides = [0, 28]
# evaluators = [None for i in range(len(slides))]
params_values = {
    "alpha": [0.001, 1, 5, 10, 20]
}
scores = []
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in range(len(slides)):
        print(slides[s], end= "\t")
        train_sub = train.iloc[:,:-(slides[s]+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
            reg = Pipeline([
                ('scale', StandardScaler()),
                ('las', Lasso(alpha=params["alpha"], normalize=False))])
            
            reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

            print(i, end= " ")

        if evaluators[s] is None:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": slides[s],
                       "score": score,
                       "params": params})

{'alpha': 0.001}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 1.1857230061799293
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0.9841960826181746
{'alpha': 1}
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2.9690158158424516
28	0 1 2 3 4 5 

KeyboardInterrupt: 

In [None]:
params_values = {
    "ridge_alpha": [1, 5, 10, 20]
}
for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in range(len(slides)):
        print(slides[s], end= "\t")
        train_sub = train.iloc[:,:-(slides[s]+1)]

        ts = M5TimeSeriesSplit(n_days=28, 
                               days_columns=train_sub.columns[6:].tolist(),
                               fixed_columns=train_sub.columns[:6].tolist(),
                               return_index=False,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=True,
                               do_enumerate=True)


        predictions = pd.DataFrame()
        for i, X_tr, y_tr, X_te, y_te in ts.split(train_sub):
            reg = Ridge(alpha=params["ridge_alpha"])
            reg.fit(X_tr.drop(columns=id_columns).values, y_tr)
            predictions[train_sub.iloc[:,-28:].columns[i]] = reg.predict(X_te.drop(columns=id_columns).values)

            print(i, end= " ")

        if evaluators[s] is None:
            evaluators[s] = WRMSSEEvaluator(train_sub.iloc[:,:-28], train_sub.iloc[:,-28:], calendar, prices)

        score = evaluators[s].score(predictions)
        print(score)
        scores.append({"slide": slides[s],
                       "score": score,
                       "params": params})

In [10]:
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=train.columns[6:].tolist(),
                       fixed_columns=train.columns[:6].tolist(),
                       return_index=True,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=True,
                       do_enumerate=True)

predictions = pd.DataFrame()
for i, X_tr, y_tr, X_te, y_te in ts.split(train):
    print(X_te[-5:])
    print(y_te)
    if i ==5:
        break

['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1886']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1887']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1888']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1889']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1890']
['d_1881', 'd_1882', 'd_1883', 'd_1884', 'd_1885']
['d_1891']


In [11]:
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=train.columns[6:].tolist() + submission.columns[1:].tolist(),
                       fixed_columns=train.columns[:6].tolist(),
                       return_index=True,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=True,
                       do_enumerate=True)

predictions = pd.DataFrame()
for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(train):
    print(i)
    X_tr, y_tr, X_te = train[X_tr_col], train[y_tr_col], train[X_te_col]
    
    reg = Lasso()
    reg.fit(X_tr.drop(columns=id_columns).values, y_tr)

    col_name = submission.columns[i+1]
    predictions[col_name] = reg.predict(X_te.drop(columns=id_columns).values)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


KeyboardInterrupt: 

In [None]:
submission.iloc[:predictions.shape[0],1:] = predictions.values

In [None]:
submission.to_csv("submissions/submission_3.csv", index=False)