# M5 Forecasting Challenge Submission by Josh Li

Makridakis Competition, hosted on Kaggle.com, datafrom Walmart

# Imports

In [2]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
import matplotlib.pyplot as plt
import time 
import seaborn as sns
from sklearn.metrics import mean_squared_error
pd.options.display.max_columns = 50 

import lightgbm as lgb
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import clear_output


# All functions

In [15]:
def reduce_mem_usage(df, verbose=False):
    '''downcasting data'''
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_columns = df.select_dtypes(include=["int"]).columns
    float_columns = df.select_dtypes(include=["float"]).columns

    for col in int_columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_data(start_day = 0, last_day = 1913):
    '''reading data in from kaggle, locally or google drive (using Colab)'''    
    INPUT_DIR = f"/content/drive/My Drive/dataInput" ##f"../input/m5-forecasting-accuracy"

    print("Reading files...")
    calendar = pd.read_csv(f"{INPUT_DIR}/calendar.csv").pipe(reduce_mem_usage)
    prices = pd.read_csv(f"{INPUT_DIR}/sell_prices.csv").pipe(reduce_mem_usage)

    cat_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    numcols = [f"d_{day}" for day in range(start_day,last_day+1)]
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in cat_cols if col != "id"})

    sales = pd.read_csv(f"{INPUT_DIR}/sales_train_evaluation.csv",
                        usecols = cat_cols + numcols, dtype = dtype
                        ).pipe(
        reduce_mem_usage
    )
    submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv").pipe(
        reduce_mem_usage
    )

    print("sales shape:", sales.shape)
    print("prices shape:", prices.shape)
    print("calendar shape:", calendar.shape)
    print("submission shape:", submission.shape)

    # calendar shape: (1969, 14)
    # sell_prices shape: (6841121, 4)
    # sales_train_val shape: (30490, 1919)
    # submission shape: (60980, 29)

    return sales, prices, calendar, submission

def encode_categorical(df, cols):
    '''encoding using label encoder'''
    df = df.copy()
    for col in cols:
        # Leave NaN as it is.
        le = preprocessing.LabelEncoder()
        not_null = df[col][df[col].notnull()]
        df[col] = pd.Series(le.fit_transform(not_null), index=not_null.index)

    return df

def extract_num(ser):
    '''extract from d_ column'''
    return ser.str.extract(r"(\d+)").astype(np.int16)

def reshape_sales(sales, day_a, day_z, submission = None, verbose=True):
    '''melting main dataframe, adding in dates and features'''
    # melt sales data, get it ready for training
    id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

    # get product table.
    product = sales[id_columns]

    sales = sales.melt(id_vars=id_columns, var_name="d", value_name="demand",)
    sales = reduce_mem_usage(sales)
    print(submission.shape, end = ' editing ')
    submission = submission[submission.id.isin(sales.id.unique())]
    print(submission.shape)
    # separate test dataframes.
    if isinstance(submission, pd.DataFrame):
      print('Adding Evaluation Columns')
      vals = submission[submission["id"].str.endswith("validation")].copy()
      evals = submission[submission["id"].str.endswith("evaluation")].copy()

      # change column names.
      vals.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + 28)]
      evals.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + 28)]

    # merge with product table
      vals = vals.merge(product, how="left", on="id")
      evals = evals.merge(product, how="left", on="id")
      
      vals = vals.melt(id_vars=id_columns, var_name="d", value_name="demand")
      evals = evals.melt(id_vars=id_columns, var_name="d", value_name="demand")

      sales["part"] = "train"
      vals["part"] = "validation"
      evals["part"] = "evaluation"

      data = pd.concat([sales, vals, evals], axis=0)

      del sales, vals, evals
    
    else:
      data = sales
      del sales

    data["d"] = extract_num(data["d"])
    data = data[(data["d"] >= day_a)&
                (data["d"] <= day_z)]

      # delete evaluation for now.
      # data = data[data["part"] != "evaluation"]

    gc.collect()
    return data

def merge_calendar(data, calendar):
    '''adding calendar to dfsales'''
    calendar = calendar.drop(["weekday", "wday", "month", "year"], axis=1)
    return data.merge(calendar, how="left", on="d")

def merge_prices(data, prices):
    '''adding prices to dfsales'''
    return data.merge(prices, how="left", on=["store_id", "item_id", "wm_yr_wk"])

def add_lag_old(dt):
    '''adding lag and rolling lag features (this takes a while)'''
    print('Entering Lag')
    ts = time.time()
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)
    print("--- {:.2f}s seconds ---".format(time.time() - ts))
    print('Entering wins') 

    wins = [3, 7, 28, 56] ########
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    print("--- {:.2f}s seconds ---".format(time.time() - ts))

    return dt

def add_time_features(df, dt_col):
    '''adding time features'''
    df[dt_col] = pd.to_datetime(df[dt_col])
    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        df[attr] = getattr(df[dt_col].dt, attr).astype(dtype)

    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    return df

def getlag_fortest(tst, day, train_cols):
    '''generating the same lage features, recursively during prediction'''
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]

    for lag, lag_col in zip(lags, lag_cols):
        tst.loc[tst.d == day, lag_col] = \
            tst.loc[tst.d == day-lag, 'sales'].values  # !!! main

    windows = [3, 7, 28, 56] ########
    for window in windows:
        for lag in lags:
            df_window = tst[(tst.d <= day-lag) & (tst.d> day-(lag+window))]
            df_window_grouped = df_window.groupby("id").agg({'sales':'mean'}).reindex(tst.loc[tst.d==day,'id'])
            tst.loc[tst.d == day,f"rmean_{lag}_{window}"] = \
                df_window_grouped.sales.values

    return tst.loc[tst.d == day, train_cols]

def setnan_testlag(te):
    '''making the empty lag columns in test'''
    for lag in ['lag_7', 'lag_28',
        'rmean_7_3', 'rmean_28_3', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28',
        'rmean_28_28', 'rmean_7_56', 'rmean_28_56',]:
      te.loc[te.d.isin(list(range(test_a, test_z))), lag] = np.NaN
    return te

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Set parameters
read in -> encode -> get data

In [4]:
### hyperparameters, originally optimized by kaggle community, and a little fine tuning by me
params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
    'verbosity': 1,
    'num_iterations' : 400, ## original set at 1200
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [5]:
### setting train and time date ranges 
t_s = time.time()
DAY_A, train_a = 790,790
train_z = 1941
test_a = train_z + 1
test_z = train_z + 1 + 27 ## both inclusive
DAY_Z = test_z
print(train_a, train_z, test_a, test_z)

790 1941 1942 1969


In [6]:
t_s = time.time()
sales, prices, calendar, submission = read_data(start_day=DAY_A, last_day=train_z)
print("--- {:.2f}s seconds ---".format(time.time() - t_s))

Reading files...
sales shape: (30490, 1158)
prices shape: (6841121, 4)
calendar shape: (1969, 14)
submission shape: (60980, 29)
--- 10.25s seconds ---


## MAIN LOOP

In [7]:
def makepreds_persubsample(subval, sales, prices, calendar, submission): 
  ''' uses all the function above to train and make predictions on a subset (store) of the data
  returns predictions '''
  print("-----------------------------------------------------")  
  print('-----Investigating: ',end = subval)
  sales = sales[sales['store_id'].str.contains(subval)]

  print(sales.shape)

  calendar = encode_categorical(
      calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
  ).pipe(reduce_mem_usage)
  sales = encode_categorical(
      sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
  ).pipe(reduce_mem_usage)
  prices = encode_categorical(prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)
  data = reshape_sales(sales, day_a = DAY_A, day_z = DAY_Z, verbose = False, submission = submission.copy())
  del sales
  gc.collect()
  calendar["d"] = extract_num(calendar["d"])
  data = merge_calendar(data, calendar)
  del calendar
  gc.collect()
  data = merge_prices(data, prices)
  del prices
  gc.collect()
  print("---Melt and Merged --- {:.2f}s seconds ---".format(time.time() - t_s))
  data = reduce_mem_usage(data)
  data = add_lag_old(data.rename(columns={'demand':'sales'}))

  dt_col = "date"
  data = add_time_features(data, dt_col).pipe(reduce_mem_usage)
  data = data.sort_values("date")

  print("data shape:", data.shape)
  inputs = data[data.d.isin(list(range(train_a, train_z +1)))]
  cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
  useless_cols = ["id", "date", "sales", "wm_yr_wk", "weekday", 'part']
  train_cols = data.columns[~data.columns.isin(useless_cols)]
  X_train = inputs[train_cols]
  y_train = inputs.sales

  np.random.seed(777)

  fake_valid_inds = np.random.choice(X_train.index.values, int(len(X_train)/5), replace = False)
  train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
  train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                          categorical_feature=cat_feats, free_raw_data=False)
  fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                                categorical_feature=cat_feats,
                  free_raw_data=False)
  max_lags = 85

  testinputs = data[data.d.isin(list(range(test_a - max_lags, test_z+1)))]

  del data, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

  ts = time.time()
  m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 
  print("---Training--- {:.2f}s seconds ---".format(time.time() - ts))


  alpha = 1.02
  print(alpha, end = "  ")
  te = testinputs.copy()
  te.loc[te.d.isin(list(range(test_a, test_z))), 'sales'] = np.NaN
  te = setnan_testlag(te)

  for fday in range(1, 28+1):
    tday = test_a - 1 + fday
    print(tday, end = " ")
    tstX = getlag_fortest(te[(te.d >= tday - max_lags) & (te.d <= tday)].copy(), tday, train_cols)

    te.loc[te.d == tday, "sales"] = \
      alpha * m_lgb.predict(tstX)
    
  te_sub = te.loc[te.d >= test_a, ["id", "sales","d"]].copy()
  te_sub = te_sub.set_index(["id", "d"]).unstack()["sales"]
  te_sub.reset_index(inplace = True)

  num_test_days = test_z - test_a + 1
  fcols = [f"F{i}" for i in range(1, num_test_days+1)]
  te_sub.columns = ['id',] + fcols
  return te_sub


In [8]:
allsubs = []
fullts = time.time()
for subval in sales.store_id.unique():
  print(time.time() - fullts)
  subt = makepreds_persubsample(subval, sales.copy(), prices.copy(), calendar.copy(), submission.copy())
  allsubs.append(subt)

0.001999378204345703
-----------------------------------------------------
-----Investigating: CA_1(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 25.48s seconds ---
Entering Lag
--- 0.97s seconds ---
Entering wins
--- 27.51s seconds ---



Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.80346
[40]	valid_0's rmse: 2.49189
[60]	valid_0's rmse: 2.40781
[80]	valid_0's rmse: 2.3807
[100]	valid_0's rmse: 2.36739
[120]	valid_0's rmse: 2.35803
[140]	valid_0's rmse: 2.34945
[160]	valid_0's rmse: 2.34203
[180]	valid_0's rmse: 2.33691
[200]	valid_0's rmse: 2.33289
[220]	valid_0's rmse: 2.32853
[240]	valid_0's rmse: 2.32347
[260]	valid_0's rmse: 2.31848
[280]	valid_0's rmse: 2.3136
[300]	valid_0's rmse: 2.31132
[320]	valid_0's rmse: 2.30858
[340]	valid_0's rmse: 2.30593
[360]	valid_0's rmse: 2.30248
[380]	valid_0's rmse: 2.30083
[400]	valid_0's rmse: 2.29781
---Training--- 400.51s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 463.7139856815338
-----------------------------------------------------
-----Investigating: CA_2(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 488.31s seconds ---
Entering Lag
---


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.08545
[40]	valid_0's rmse: 1.89545
[60]	valid_0's rmse: 1.84391
[80]	valid_0's rmse: 1.8275
[100]	valid_0's rmse: 1.82041
[120]	valid_0's rmse: 1.81587
[140]	valid_0's rmse: 1.81157
[160]	valid_0's rmse: 1.80882
[180]	valid_0's rmse: 1.80615
[200]	valid_0's rmse: 1.80396
[220]	valid_0's rmse: 1.80266
[240]	valid_0's rmse: 1.80059
[260]	valid_0's rmse: 1.79947
[280]	valid_0's rmse: 1.79767
[300]	valid_0's rmse: 1.79611
[320]	valid_0's rmse: 1.79472
[340]	valid_0's rmse: 1.79333
[360]	valid_0's rmse: 1.79208
[380]	valid_0's rmse: 1.79095
[400]	valid_0's rmse: 1.78955
---Training--- 411.58s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 935.7488031387329
-----------------------------------------------------
-----Investigating: CA_3(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 960.23s seconds ---
Entering Lag
--


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 4.11381
[40]	valid_0's rmse: 3.59264
[60]	valid_0's rmse: 3.45356
[80]	valid_0's rmse: 3.40671
[100]	valid_0's rmse: 3.37745
[120]	valid_0's rmse: 3.35988
[140]	valid_0's rmse: 3.34723
[160]	valid_0's rmse: 3.32752
[180]	valid_0's rmse: 3.31394
[200]	valid_0's rmse: 3.30378
[220]	valid_0's rmse: 3.28965
[240]	valid_0's rmse: 3.28545
[260]	valid_0's rmse: 3.28029
[280]	valid_0's rmse: 3.274
[300]	valid_0's rmse: 3.26719
[320]	valid_0's rmse: 3.25903
[340]	valid_0's rmse: 3.25171
[360]	valid_0's rmse: 3.24699
[380]	valid_0's rmse: 3.2421
[400]	valid_0's rmse: 3.23724
---Training--- 398.24s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1394.231544494629
-----------------------------------------------------
-----Investigating: CA_4(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 1418.74s seconds ---
Entering Lag
---


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 1.5954
[40]	valid_0's rmse: 1.47923
[60]	valid_0's rmse: 1.44929
[80]	valid_0's rmse: 1.44092
[100]	valid_0's rmse: 1.43756
[120]	valid_0's rmse: 1.4346
[140]	valid_0's rmse: 1.43242
[160]	valid_0's rmse: 1.43028
[180]	valid_0's rmse: 1.42897
[200]	valid_0's rmse: 1.42777
[220]	valid_0's rmse: 1.42698
[240]	valid_0's rmse: 1.42609
[260]	valid_0's rmse: 1.42524
[280]	valid_0's rmse: 1.4243
[300]	valid_0's rmse: 1.42347
[320]	valid_0's rmse: 1.42298
[340]	valid_0's rmse: 1.42248
[360]	valid_0's rmse: 1.42182
[380]	valid_0's rmse: 1.42122
[400]	valid_0's rmse: 1.42063
---Training--- 424.45s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1879.5834376811981
-----------------------------------------------------
-----Investigating: TX_1(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 1905.20s seconds ---
Entering Lag
--


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.37746
[40]	valid_0's rmse: 2.15142
[60]	valid_0's rmse: 2.09233
[80]	valid_0's rmse: 2.07388
[100]	valid_0's rmse: 2.06724
[120]	valid_0's rmse: 2.0619
[140]	valid_0's rmse: 2.05749
[160]	valid_0's rmse: 2.05224
[180]	valid_0's rmse: 2.04719
[200]	valid_0's rmse: 2.04006
[220]	valid_0's rmse: 2.03704
[240]	valid_0's rmse: 2.03444
[260]	valid_0's rmse: 2.03263
[280]	valid_0's rmse: 2.02839
[300]	valid_0's rmse: 2.02638
[320]	valid_0's rmse: 2.02516
[340]	valid_0's rmse: 2.02309
[360]	valid_0's rmse: 2.02126
[380]	valid_0's rmse: 2.02
[400]	valid_0's rmse: 2.01725
---Training--- 411.44s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 2353.9972846508026
-----------------------------------------------------
-----Investigating: TX_2(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 2379.79s seconds ---
Entering Lag
---


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.83526
[40]	valid_0's rmse: 2.49053
[60]	valid_0's rmse: 2.3981
[80]	valid_0's rmse: 2.37143
[100]	valid_0's rmse: 2.36627
[120]	valid_0's rmse: 2.35904
[140]	valid_0's rmse: 2.35476
[160]	valid_0's rmse: 2.34711
[180]	valid_0's rmse: 2.34073
[200]	valid_0's rmse: 2.3352
[220]	valid_0's rmse: 2.33256
[240]	valid_0's rmse: 2.32862
[260]	valid_0's rmse: 2.32413
[280]	valid_0's rmse: 2.31968
[300]	valid_0's rmse: 2.31423
[320]	valid_0's rmse: 2.3102
[340]	valid_0's rmse: 2.30643
[360]	valid_0's rmse: 2.30319
[380]	valid_0's rmse: 2.30093
[400]	valid_0's rmse: 2.29651
---Training--- 404.48s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 2821.203385591507
-----------------------------------------------------
-----Investigating: TX_3(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 2845.98s seconds ---
Entering Lag
---


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.53319
[40]	valid_0's rmse: 2.23009
[60]	valid_0's rmse: 2.15112
[80]	valid_0's rmse: 2.12797
[100]	valid_0's rmse: 2.11579
[120]	valid_0's rmse: 2.10979
[140]	valid_0's rmse: 2.10203
[160]	valid_0's rmse: 2.09778
[180]	valid_0's rmse: 2.09179
[200]	valid_0's rmse: 2.08635
[220]	valid_0's rmse: 2.08313
[240]	valid_0's rmse: 2.08063
[260]	valid_0's rmse: 2.07873
[280]	valid_0's rmse: 2.07463
[300]	valid_0's rmse: 2.07205
[320]	valid_0's rmse: 2.06874
[340]	valid_0's rmse: 2.06613
[360]	valid_0's rmse: 2.06369
[380]	valid_0's rmse: 2.05996
[400]	valid_0's rmse: 2.05684
---Training--- 413.38s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 3296.376686811447
-----------------------------------------------------
-----Investigating: WI_1(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 3320.79s seconds ---
Entering Lag



Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 1.89121
[40]	valid_0's rmse: 1.71143
[60]	valid_0's rmse: 1.66283
[80]	valid_0's rmse: 1.64781
[100]	valid_0's rmse: 1.64004
[120]	valid_0's rmse: 1.6346
[140]	valid_0's rmse: 1.63005
[160]	valid_0's rmse: 1.62689
[180]	valid_0's rmse: 1.62447
[200]	valid_0's rmse: 1.62271
[220]	valid_0's rmse: 1.62064
[240]	valid_0's rmse: 1.61887
[260]	valid_0's rmse: 1.61723
[280]	valid_0's rmse: 1.61587
[300]	valid_0's rmse: 1.61456
[320]	valid_0's rmse: 1.61311
[340]	valid_0's rmse: 1.61124
[360]	valid_0's rmse: 1.60991
[380]	valid_0's rmse: 1.60888
[400]	valid_0's rmse: 1.60772
---Training--- 405.84s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 3762.764197587967
-----------------------------------------------------
-----Investigating: WI_2(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 3787.30s seconds ---
Entering Lag
-


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 3.11446
[40]	valid_0's rmse: 2.83431
[60]	valid_0's rmse: 2.75884
[80]	valid_0's rmse: 2.73342
[100]	valid_0's rmse: 2.72065
[120]	valid_0's rmse: 2.70901
[140]	valid_0's rmse: 2.69816
[160]	valid_0's rmse: 2.69158
[180]	valid_0's rmse: 2.68359
[200]	valid_0's rmse: 2.67554
[220]	valid_0's rmse: 2.66943
[240]	valid_0's rmse: 2.66423
[260]	valid_0's rmse: 2.65745
[280]	valid_0's rmse: 2.65105
[300]	valid_0's rmse: 2.64615
[320]	valid_0's rmse: 2.64159
[340]	valid_0's rmse: 2.6379
[360]	valid_0's rmse: 2.63425
[380]	valid_0's rmse: 2.62951
[400]	valid_0's rmse: 2.62442
---Training--- 379.26s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 4202.119118213654
-----------------------------------------------------
-----Investigating: WI_3(3049, 1158)
(60980, 29) editing (3049, 29)
Adding Evaluation Columns
---Melt and Merged --- 4226.47s seconds ---
Entering Lag
-


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



data shape: (3597820, 36)



Found `num_iterations` in params. Will use it instead of argument


Using categorical_feature in Dataset.


categorical_feature in param dict is overridden.



[20]	valid_0's rmse: 2.46189
[40]	valid_0's rmse: 2.17388
[60]	valid_0's rmse: 2.09785
[80]	valid_0's rmse: 2.07142
[100]	valid_0's rmse: 2.05783
[120]	valid_0's rmse: 2.04832
[140]	valid_0's rmse: 2.04001
[160]	valid_0's rmse: 2.03243
[180]	valid_0's rmse: 2.02515
[200]	valid_0's rmse: 2.01869
[220]	valid_0's rmse: 2.01218
[240]	valid_0's rmse: 2.00954
[260]	valid_0's rmse: 2.00373
[280]	valid_0's rmse: 1.99854
[300]	valid_0's rmse: 1.99491
[320]	valid_0's rmse: 1.991
[340]	valid_0's rmse: 1.98792
[360]	valid_0's rmse: 1.98494
[380]	valid_0's rmse: 1.98118
[400]	valid_0's rmse: 1.97782
---Training--- 403.71s seconds ---
1.02  1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 

In [16]:
subf = pd.concat(allsubs)
sub2 = subf.copy()
sub2["id"] = sub2["id"].str.replace("evaluation$", "validation")
## submission requires an evaluation and validation period, only validation is scored
sub_final = pd.concat([sub2, subf], axis=0, sort=False)
sub_final.to_csv('submission.csv', index = False)

In [17]:
sub_final

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.805944,0.698793,0.669990,0.670222,0.756504,0.849930,0.740598,0.617947,0.693704,0.848059,0.730552,1.004943,1.163338,1.113720,0.880755,0.909825,0.800699,0.823179,0.932566,1.034605,0.993411,0.768044,0.694400,0.715742,0.699942,0.876039,1.016198,0.905801
1,FOODS_1_002_CA_1_validation,0.519184,0.497519,0.460744,0.496724,0.591623,0.693767,0.658139,0.560458,0.493675,0.516667,0.439136,0.589707,0.665047,0.642738,0.462435,0.451906,0.479707,0.498081,0.536791,0.734719,0.735620,0.529822,0.516008,0.511108,0.526452,0.600936,0.739214,0.645907
2,FOODS_1_003_CA_1_validation,0.786312,0.582529,0.609231,0.645934,0.723629,0.859001,0.872280,0.836373,0.617976,0.731273,0.603175,0.866376,1.023086,1.030001,0.887638,0.678062,0.688682,0.720757,0.847600,0.941130,0.872221,0.838333,0.604885,0.609438,0.687963,0.776054,0.936262,0.771897
3,FOODS_1_004_CA_1_validation,2.988702,3.071787,2.918010,2.952508,3.941592,4.816790,4.863614,4.910433,4.216246,3.582169,3.644052,3.873744,4.208416,4.147287,3.220568,2.996899,3.047585,3.043344,3.682558,4.320133,4.212607,3.314375,3.309139,3.256172,3.356370,3.840332,4.377355,4.461570
4,FOODS_1_005_CA_1_validation,1.366127,1.092656,0.993736,0.954736,1.048404,1.331253,1.407933,1.294835,1.607762,1.156600,1.869463,1.337143,1.727253,1.653949,1.309180,1.353623,1.155900,1.116086,1.333224,1.809925,1.603400,1.326538,0.959745,1.005175,1.001749,1.359114,1.797425,1.887245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_WI_3_evaluation,0.377480,0.346123,0.369416,0.353446,0.365587,0.339732,0.331319,0.347365,0.298963,0.297666,0.312822,0.420709,0.367257,0.345777,0.393157,0.334004,0.332374,0.347948,0.369076,0.360348,0.370508,0.347073,0.332509,0.328361,0.314211,0.340464,0.329964,0.305732
3045,HOUSEHOLD_2_513_WI_3_evaluation,0.286599,0.277683,0.268564,0.261102,0.250275,0.284769,0.266242,0.265593,0.287257,0.319288,0.269933,0.341644,0.327585,0.320281,0.254084,0.258758,0.266586,0.309483,0.321667,0.320536,0.364631,0.302418,0.265725,0.261971,0.245993,0.350472,0.320559,0.310134
3046,HOUSEHOLD_2_514_WI_3_evaluation,0.091176,0.090391,0.089907,0.089907,0.121104,0.126849,0.137523,0.093282,0.090639,0.117596,0.109256,0.145512,0.122557,0.141738,0.118224,0.106578,0.106279,0.119099,0.121385,0.152226,0.142681,0.095301,0.097133,0.108268,0.085025,0.106356,0.114444,0.098036
3047,HOUSEHOLD_2_515_WI_3_evaluation,0.211371,0.241319,0.217724,0.223765,0.297317,0.318444,0.272201,0.238742,0.211907,0.257403,0.225462,0.292667,0.292274,0.278927,0.233726,0.213882,0.209682,0.217631,0.283932,0.287776,0.318290,0.251883,0.213702,0.208656,0.207377,0.281128,0.291955,0.259932
