In [1]:
import pandas as pd
import numpy as np
import os
from m5_helpers.metrics import WRMSSEEvaluator
from m5_helpers.model_selection import M5TimeSeriesSplit
from m5_helpers.misc import reduce_mem_usage, get_param_grid, get_latest_same_weekday
from IPython.display import display, HTML

%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Read the four CSV inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
calendar, prices, train, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/calendar.csv",
        "data/sell_prices.csv",
        "data/sales_train_validation.csv",
        "data/sample_submission.csv",
    )
)

# Shared state filled in by later cells: evaluators keyed by slide offset, and
# one dict per scored (slide, params) run.
evaluators, scores = {}, []

In [4]:
# Columns that identify a series (everything that is not a daily sales count).
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Daily sales columns d_1 .. d_N, where N is read from the last column of `train`.
# The day number is parsed from whatever follows the final underscore rather than
# from the last four characters, so it stays correct when N is not exactly four
# digits wide.
sales_columns = [f"d_{day}" for day in range(1, 1 + int(train.columns[-1].rsplit("_", 1)[-1]))]

In [5]:
# Run the project's memory-reduction helper on `train` (it reports the saving it
# achieved in the cell output).
# NOTE(review): the return value is discarded, so this presumably relies on the
# helper modifying `train` in place — confirm against m5_helpers.misc.
reduce_mem_usage(train)

Mem. usage decreased to 95.00 Mb (78.7% reduction)


In [372]:
# Attach a per-series evaluation weight to every row of `train`.
#
# NOTE(review): this cell reads evaluators[0], which is only populated by the
# grid-search cell further down (and only when an evaluator for slide 0 could be
# built), so it cannot run on a fresh kernel in the order shown — confirm the
# intended execution order.
if 0 not in evaluators:
    raise KeyError("evaluators[0] is missing: build the slide-0 WRMSSEEvaluator before running this cell")

weights = evaluators[0].weights.reset_index()
weights.columns = ["weight_id", "weight"]  # assumes reset_index() yields exactly two columns — TODO confirm

# Join key between series and weights: "<item_id>--<store_id>".
train["weight_id"] = train["item_id"] + "--" + train["store_id"]

# Extend id_columns only with names it does not already contain, so re-running
# this cell does not duplicate them.
id_columns = id_columns + [col for col in ("weight_id", "weight") if col not in id_columns]

# Drop a weight column left behind by a previous run; otherwise the merge would
# emit weight_x / weight_y and the column selection below would fail.
train = train.drop(columns=["weight"], errors="ignore")
train = pd.merge(train, weights, on="weight_id", how="left")
train = train[id_columns + sales_columns]

# Rescale the merged weights by a constant factor.
# NOTE(review): the reason for the factor 10000 is not visible here — confirm.
train["weight"] = train["weight"] * 10000

In [155]:
# Long-format view of the 28 day columns d_1886 .. d_1913; every other column of
# `train` is carried along as an identifier.
value_vars = [f"d_{day}" for day in range(1886, 1914)]
id_vars = train.columns[~train.columns.isin(value_vars)].tolist()
train_m = pd.melt(train, id_vars=id_vars, value_vars=value_vars)

In [162]:
# Preview: attach the calendar `wday` value to the first melted rows by matching
# the melted day label against the calendar's `d` column.
pd.merge(
    train_m.head(),
    calendar.loc[:, ["d", "wday"]],
    left_on="variable",
    right_on="d",
)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,variable,value,d,wday
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,2,0,1,1,1,d_1886,1,d_1886,3
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,1,1,1,1,d_1886,1,d_1886,3
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,1,0,d_1886,0,d_1886,3
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,5,0,6,6,d_1886,0,d_1886,3
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,1,0,0,0,0,d_1886,1,d_1886,3


In [140]:
# Grid search over LightGBM parameters, evaluated at several "slides".
# A slide is the number of trailing entries of all_sales_columns (training days
# followed by the forecast days) that are dropped before the splitter is built.
slides = [28, 32, 0]
params_values = {
#     "alpha": [1, 5, 10, 20]
    "colsample_bytree": [0.6],
    "n_estimators": [100]
}

# Translate the submission columns (F1, F2, ...) into day labels that continue
# after the last column of `train`. The day number is parsed from the text after
# the final underscore so it does not depend on being four digits wide.
submission_cols_transformed = [
    f"d_{int(train.columns[-1].rsplit('_', 1)[-1]) + int(col[1:])}"
    for col in submission.columns[1:]
]
all_sales_columns = sales_columns + submission_cols_transformed

for params in get_param_grid(params_values):
    print(params, end="\n")
    for s in slides:
        print(s, end="\t")
        slided_sales_columns = all_sales_columns[:len(all_sales_columns) - s]
        all_slided_columns = id_columns + slided_sales_columns

        # An evaluator can only be built when every required column, including
        # the final 28 hold-out days, is actually present in `train`. Evaluators
        # are cached per slide so they are built at most once.
        if s not in evaluators and np.all(np.isin(all_slided_columns, train.columns)):
            evaluators[s] = WRMSSEEvaluator(
                train.loc[:, all_slided_columns[:-28]],
                train.loc[:, all_slided_columns[-28:]],
                calendar,
                prices)

        ts = M5TimeSeriesSplit(n_days=28,
                               days_columns=slided_sales_columns,
                               fixed_columns=id_columns,
                               return_index=True,
                               rename=False,
                               sliding_window=True,
                               method=1,
                               split_train=False,
                               do_enumerate=True)

        # One model per forecast day; each split yields column names, not data.
        predictions = pd.DataFrame()
        for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(None):
            X_tr, X_te = train[X_tr_col], train[X_te_col]

            # y_te_col[0] has the form "d_<day>" in this cell.
            day_test = int(y_te_col[0][2:])
            max_day_train = int(X_tr.columns[-1].rsplit("_", 1)[-1])
            # presumably the most recent training day sharing day_test's weekday —
            # confirm against m5_helpers.misc
            day_train = get_latest_same_weekday(day_test, max_day_train)

            # Drop the identifier columns once and derive target and features from
            # the same frame: the target is the column for day_train, the features
            # are all day columns up to and including d_{day_train-i-1}.
            sales_tr = X_tr.drop(columns=id_columns)
            y_tr = sales_tr.loc[:, f"d_{day_train}"]
            X_tr = sales_tr.loc[:, :f"d_{day_train-i-1}"]
            # Use the same number of trailing day columns for the test features.
            X_te = X_te.drop(columns=id_columns).iloc[:, -X_tr.shape[1]:]

            reg = LGBMRegressor(**params)
            reg.fit(X_tr.values,
                    y_tr.values)
            predictions[y_te_col[0]] = reg.predict(X_te.values)

            print(i, end=" ")

        # Score only when an evaluator exists for this slide. Checking membership
        # explicitly (instead of a bare except) lets genuine scoring errors
        # surface rather than being reported as a missing evaluator.
        if s in evaluators:
            score = evaluators[s].score(predictions)
            print(score)
            scores.append({"slide": s,
                           "score": score,
                           "params": params})
        else:
            print("couldn't find an evaluator")


{'colsample_bytree': 0.6, 'n_estimators': 100}
28	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.7417564626861195
32	

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42840), HTML(value='')))


0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 0.8624385600477226
0	0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 couldn't find an evaluator


In [141]:
# Show the predictions frame left behind by the last slide processed in the loop
# cell (it is rebuilt from scratch for every slide).
predictions

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0.920631,0.820418,0.660337,0.816939,0.855827,0.990675,1.021169,0.848190,0.729677,0.638372,...,0.717335,0.768788,0.959128,0.834501,0.687194,0.611192,0.623504,0.904162,1.088124,0.878956
1,0.201179,0.145694,0.205044,0.212516,0.207973,0.195447,0.201112,0.167159,0.162663,0.184801,...,0.254524,0.268212,0.247459,0.227544,0.175558,0.224788,0.169157,0.347697,0.368270,0.367295
2,0.434735,0.522380,0.536358,0.448343,0.564696,0.675120,0.551327,0.518241,0.580412,0.415860,...,0.550254,0.763403,0.674762,0.484356,0.408435,0.352621,0.468234,0.557148,0.725674,0.716241
3,1.565242,1.385682,1.249325,1.346097,1.644295,2.609890,2.829781,1.985841,1.665019,2.167715,...,2.154914,2.403448,2.542587,1.842900,1.951591,1.676979,1.534353,2.100628,2.472813,3.754027
4,1.078194,1.070272,1.102865,1.072075,1.106309,1.388523,1.489489,1.074109,1.024830,1.070487,...,1.196606,1.936835,1.283597,1.040377,1.016422,1.154101,1.134451,1.112468,1.505579,1.640970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0.235468,0.326595,0.379497,0.275591,0.418147,0.389748,0.446792,0.416869,0.282070,0.508004,...,0.381917,0.524653,0.402072,0.456782,0.345637,0.454466,0.507872,0.553502,0.623674,0.422664
30486,0.322551,0.222830,0.184301,0.207248,0.273237,0.177452,0.284710,0.209363,0.218481,0.214021,...,0.231854,0.271581,0.319099,0.229146,0.311119,0.229469,0.221545,0.234723,0.284413,0.287990
30487,0.938396,1.524763,0.721209,0.750656,0.909649,1.035076,0.813100,0.755139,0.580639,0.661715,...,0.697878,0.860734,0.986114,0.793383,0.822276,0.754982,0.619122,0.963764,0.918877,0.831132
30488,1.010340,0.979058,0.847334,1.303045,1.136522,1.122040,1.407342,0.869307,0.909752,0.455478,...,0.940405,1.144965,1.996188,1.353046,0.799975,0.800126,0.942748,1.407165,1.103461,1.448778


In [142]:
# Every column of `train`, followed by the forecast columns of the submission
# template (its first column is skipped).
all_columns = [*train.columns, *submission.columns[1:]]

# Splitter over all non-identifier columns, here with n_days=30.
ts = M5TimeSeriesSplit(
    n_days=30,
    days_columns=list(filter(lambda col: col not in id_columns, all_columns)),
    fixed_columns=id_columns,
    return_index=True,
    rename=False,
    sliding_window=True,
    method=1,
    split_train=False,
    do_enumerate=True,
)

# Print the target column list produced by each split.
for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(None):
    print(y_te_col)

['d_1912']
['d_1913']
['F1']
['F2']
['F3']
['F4']
['F5']
['F6']
['F7']
['F8']
['F9']
['F10']
['F11']
['F12']
['F13']
['F14']
['F15']
['F16']
['F17']
['F18']
['F19']
['F20']
['F21']
['F22']
['F23']
['F24']
['F25']
['F26']
['F27']
['F28']


In [12]:
# NOTE(review): this cell depends on `s`, `params` and `predictions` left behind
# by a previously executed loop cell, and it scores with the evaluator stored
# under s-28 while recording the run under slide s. The saved output below it is
# a KeyError ("Columns not found: ..."), so confirm whether this cell should be
# kept and which evaluator key is intended.
score = evaluators[s-28].score(predictions)
print(score)
scores.append({"slide": s,
               "score": score,
               "params": params})

Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x7fc630beced0>
Traceback (most recent call last):
  File "/Users/joelponte/opt/anaconda3/envs/machine-learning-platform/lib/python3.7/site-packages/tqdm/notebook.py", line 227, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


KeyError: "Columns not found: 'd_1888', 'd_1910', 'd_1890', 'd_1903', 'd_1900', 'd_1913', 'd_1902', 'd_1892', 'd_1895', 'd_1906', 'd_1905', 'd_1896', 'd_1897', 'd_1889', 'd_1901', 'd_1908', 'd_1904', 'd_1894', 'd_1893', 'd_1891', 'd_1887', 'd_1886', 'd_1911', 'd_1907', 'd_1912', 'd_1899', 'd_1909', 'd_1898'"

In [None]:
# NOTE(review): this cell has no execution count and its code appears to be the
# same as the executed cell that follows it — confirm whether it can be removed.
#
# Build a splitter over every non-identifier column of `train` plus the forecast
# columns of the submission template.
ts = M5TimeSeriesSplit(n_days=28, 
                       days_columns=[i for i in train.columns
                                      if i not in id_columns] + \
                       submission.columns[1:].tolist(),
                       fixed_columns=id_columns,
                       return_index=True,
                       rename=False,
                       sliding_window=True,
                       method=1,
                       split_train=False,
                       do_enumerate=True)

# One LightGBM model is fitted per split; each split contributes one column.
predictions = pd.DataFrame()
params = {'colsample_bytree': 0.6}
for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(train):
    print(i, end=" ")
    # The splitter yields column names; select the matching columns of `train`.
    X_tr, X_te = train[X_tr_col], train[X_te_col]
    
    # Target day = number parsed from the last test-feature column plus the number
    # parsed from the target label after its first character.
    day_test = int(X_te.columns[-1][-4:]) + int(y_te_col[0][1:])
    max_day_train = int(X_tr.columns[-1][-4:])
    # presumably the most recent training day sharing day_test's weekday —
    # confirm against m5_helpers.misc
    day_train = get_latest_same_weekday(day_test, max_day_train)

    # Target: the column for day_train. Features: day columns up to and including
    # d_{day_train-i-1}; the test features keep the same number of trailing columns.
    y_tr = X_tr.drop(columns=id_columns).loc[:, f"d_{day_train}"]
    X_tr = X_tr.drop(columns=id_columns).loc[:, :f"d_{day_train-i-1}"]
    X_te = X_te.drop(columns=id_columns).iloc[:,-X_tr.shape[1]:]
    
    reg = LGBMRegressor(**params)
    reg.fit(X_tr.values, y_tr)

    # Store the forecast under the (i+1)-th submission column name.
    col_name = submission.columns[i+1]
    predictions[col_name] = reg.predict(X_te.values)


In [611]:
# Splitter over every non-identifier column of `train` followed by the forecast
# columns of the submission template.
ts = M5TimeSeriesSplit(
    n_days=28,
    days_columns=[col for col in train.columns if col not in id_columns]
                 + list(submission.columns[1:]),
    fixed_columns=id_columns,
    return_index=True,
    rename=False,
    sliding_window=True,
    method=1,
    split_train=False,
    do_enumerate=True,
)

# Fit one LightGBM model per split and collect one forecast column per model.
predictions = pd.DataFrame()
params = {'colsample_bytree': 0.6}
for i, X_tr_col, y_tr_col, X_te_col, y_te_col in ts.split(train):
    print(i, end=" ")

    # Select the split's columns and strip the identifier columns straight away;
    # the trailing day column, which the parsing below relies on, is unaffected.
    X_tr = train[X_tr_col].drop(columns=id_columns)
    X_te = train[X_te_col].drop(columns=id_columns)

    # Target day = day number of the last test-feature column plus the number
    # that follows the first character of the target label.
    day_test = int(X_te.columns[-1][-4:]) + int(y_te_col[0][1:])
    max_day_train = int(X_tr.columns[-1][-4:])
    # presumably the most recent training day sharing day_test's weekday —
    # confirm against m5_helpers.misc
    day_train = get_latest_same_weekday(day_test, max_day_train)

    # Target first (it is read from the untrimmed frame), then trim the features
    # and keep the same number of trailing columns for the test features.
    y_tr = X_tr[f"d_{day_train}"]
    X_tr = X_tr.loc[:, :f"d_{day_train-i-1}"]
    X_te = X_te.iloc[:, -X_tr.shape[1]:]

    reg = LGBMRegressor(**params)
    reg.fit(X_tr.to_numpy(), y_tr)

    # Store the forecast under the (i+1)-th submission column name.
    col_name = submission.columns[i+1]
    predictions[col_name] = reg.predict(X_te.to_numpy())


0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 

In [612]:
# Show the forecast frame built by the previous cell, one column per submission
# column name.
predictions

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,0.920631,0.820418,0.660337,0.816939,0.855827,0.990675,1.021169,0.848190,0.729677,0.638372,...,0.717335,0.768788,0.959128,0.834501,0.687194,0.611192,0.623504,0.904162,1.088124,0.878956
1,0.201179,0.145694,0.205044,0.212516,0.207973,0.195447,0.201112,0.167159,0.162663,0.184801,...,0.254524,0.268212,0.247459,0.227544,0.175558,0.224788,0.169157,0.347697,0.368270,0.367295
2,0.434735,0.522380,0.536358,0.448343,0.564696,0.675120,0.551327,0.518241,0.580412,0.415860,...,0.550254,0.763403,0.674762,0.484356,0.408435,0.352621,0.468234,0.557148,0.725674,0.716241
3,1.565242,1.385682,1.249325,1.346097,1.644295,2.609890,2.829781,1.985841,1.665019,2.167715,...,2.154914,2.403448,2.542587,1.842900,1.951591,1.676979,1.534353,2.100628,2.472813,3.754027
4,1.078194,1.070272,1.102865,1.072075,1.106309,1.388523,1.489489,1.074109,1.024830,1.070487,...,1.196606,1.936835,1.283597,1.040377,1.016422,1.154101,1.134451,1.112468,1.505579,1.640970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0.235468,0.326595,0.379497,0.275591,0.418147,0.389748,0.446792,0.416869,0.282070,0.508004,...,0.381917,0.524653,0.402072,0.456782,0.345637,0.454466,0.507872,0.553502,0.623674,0.422664
30486,0.322551,0.222830,0.184301,0.207248,0.273237,0.177452,0.284710,0.209363,0.218481,0.214021,...,0.231854,0.271581,0.319099,0.229146,0.311119,0.229469,0.221545,0.234723,0.284413,0.287990
30487,0.938396,1.524763,0.721209,0.750656,0.909649,1.035076,0.813100,0.755139,0.580639,0.661715,...,0.697878,0.860734,0.986114,0.793383,0.822276,0.754982,0.619122,0.963764,0.918877,0.831132
30488,1.010340,0.979058,0.847334,1.303045,1.136522,1.122040,1.407342,0.869307,0.909752,0.455478,...,0.940405,1.144965,1.996188,1.353046,0.799975,0.800126,0.942748,1.407165,1.103461,1.448778


In [599]:
# Look up the calendar row whose day label is d_1913.
calendar.loc[calendar["d"].eq("d_1913")]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1912,2016-04-24,11613,Sunday,2,4,2016,d_1913,,,,,0,0,0


In [598]:
# Look up the calendar row whose day label is d_1934.
calendar.loc[calendar["d"].eq("d_1934")]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1933,2016-05-15,11616,Sunday,2,5,2016,d_1934,,,,,0,1,1


In [616]:
# Copy the forecasts into the leading rows of the submission template; rows after
# them are left untouched.
# The write is purely positional, so first verify that those leading rows carry
# the same ids, in the same order, as the rows of `train` the forecasts were
# computed from.
assert np.array_equal(
    submission["id"].iloc[:predictions.shape[0]].to_numpy(),
    train["id"].to_numpy(),
), "submission rows are not aligned with the rows of train"
submission.iloc[:predictions.shape[0], 1:] = predictions.values

In [617]:
# Show the submission frame after the forecasts were written into its leading rows.
submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.920631,0.820418,0.660337,0.816939,0.855827,0.990675,1.021169,0.848190,0.729677,...,0.717335,0.768788,0.959128,0.834501,0.687194,0.611192,0.623504,0.904162,1.088124,0.878956
1,HOBBIES_1_002_CA_1_validation,0.201179,0.145694,0.205044,0.212516,0.207973,0.195447,0.201112,0.167159,0.162663,...,0.254524,0.268212,0.247459,0.227544,0.175558,0.224788,0.169157,0.347697,0.368270,0.367295
2,HOBBIES_1_003_CA_1_validation,0.434735,0.522380,0.536358,0.448343,0.564696,0.675120,0.551327,0.518241,0.580412,...,0.550254,0.763403,0.674762,0.484356,0.408435,0.352621,0.468234,0.557148,0.725674,0.716241
3,HOBBIES_1_004_CA_1_validation,1.565242,1.385682,1.249325,1.346097,1.644295,2.609890,2.829781,1.985841,1.665019,...,2.154914,2.403448,2.542587,1.842900,1.951591,1.676979,1.534353,2.100628,2.472813,3.754027
4,HOBBIES_1_005_CA_1_validation,1.078194,1.070272,1.102865,1.072075,1.106309,1.388523,1.489489,1.074109,1.024830,...,1.196606,1.936835,1.283597,1.040377,1.016422,1.154101,1.134451,1.112468,1.505579,1.640970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60976,FOODS_3_824_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60977,FOODS_3_825_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60978,FOODS_3_826_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [618]:
# submission.to_csv("submissions/submission_5.csv", index=False)