In [23]:
import lightgbm as lgbm
import numpy as np
import pandas as pd

# TODO: correct data types, more variables, see about scaling, alphas, magic multipliers

# Daily sales columns d_1 .. d_1913 of "sales_train_validation.csv"
numCols = [f"d_{day}" for day in range(1, 1914)]

# Define all categorical columns
catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']

# Identifier columns that are actually kept. item_id, dept_id and store_id
# used to be parsed and then dropped on the very next line, so they are simply
# not requested from the CSV; the resulting frame has the same columns.
keptCatCols = [catCol for catCol in catCols if catCol not in ("item_id", "dept_id", "store_id")]

# Define the correct data types for "sales_train_validation.csv".
# "id" is left out of the mapping, so it is not converted to category.
dtype = {numCol: "float32" for numCol in numCols}
dtype.update({catCol: "category" for catCol in keptCatCols if catCol != "id"})

# read the sales data (only the columns that are used afterwards)
sales_data = pd.read_csv("/media/hhew0002/f0df6edb-45fe-4416-8076-34757a0abceb/hhew0002/Academic/Competitions/M5 Competition/data/sales_train_validation.csv", usecols=keptCatCols + numCols, dtype=dtype)

In [8]:
# read the calendar data
calendarDTypes = {"event_name_1": "category",
                  "event_name_2": "category",
                  "event_type_1": "category",
                  "event_type_2": "category",
                  "weekday": "category",
                  'wm_yr_wk': 'int16',
                  "wday": "int16",
                  "month": "int16",
                  "year": "int16",
                  "snap_CA": "float32",
                  'snap_TX': 'float32',
                  'snap_WI': 'float32' }
calendar_data = pd.read_csv("/media/hhew0002/f0df6edb-45fe-4416-8076-34757a0abceb/hhew0002/Academic/Competitions/M5 Competition/data/calendar.csv", dtype=calendarDTypes)

# remove the last 28 points from the calendar data
horizon = 28
# .copy() detaches the trimmed frame from the one returned by read_csv, so the
# column assignments below write to an independent frame instead of a slice
# (assigning into a slice triggers pandas' SettingWithCopyWarning).
calendar_data = calendar_data.iloc[:-horizon, :].copy()


# Replace every categorical column by its integer category code, shifted so
# that the smallest code in the column becomes 0. pandas encodes a missing
# category as -1, so in a column that contains missing values those rows end
# up with code 0.
for col, colDType in calendarDTypes.items():
    if colDType == "category":
        calendar_data[col] = calendar_data[col].cat.codes.astype("int16")
        calendar_data[col] -= calendar_data[col].min()

# define function for adding holiday column
def add_holiday_value(row):
    """Return 0 when the row's ``event_name_1`` value equals 0, otherwise 1.

    ``row`` only needs to support ``row['event_name_1']`` lookup.
    """
    return 0 if row['event_name_1'] == 0 else 1

# Flag each calendar row via add_holiday_value (1 when event_name_1 is non-zero).
calendar_data['holiday'] = calendar_data.apply(add_holiday_value, axis=1)

# These columns are removed before the calendar is merged into the sales data.
calendar_data = calendar_data.drop(columns=["date", "wm_yr_wk", "year"])

# The final `horizon` rows form the test calendar, everything before them the
# training calendar.
calendar_data_test = calendar_data.iloc[-horizon:, :]
calendar_data_train = calendar_data.iloc[:-horizon, :]

In [24]:
# embed the sales data to have a lag length of 10
# Only the first 10 rows (series) of sales_data are used here. melt turns the
# wide d_* columns into one row per (id, d) with the sales in "value".
data = pd.melt(sales_data.iloc[0:10, :], id_vars=['id', 'cat_id', 'state_id'], var_name="d")

# "id" and "value" are not modified while the lag columns are added, so the
# per-series grouping is built once and reused for every lag.
value_by_id = data[["id","value"]].groupby("id")["value"]
for lags in range(1,11):
    # lagN = the value N rows earlier within the same id
    data[f"lag{lags}"] = value_by_id.shift(lags)

In [25]:
# merge the calendar data into the sales data, discard rows with any missing
# value (e.g. the first days of each series, whose lags are undefined), and
# remove the key columns that are not model features
full_training_data = (
    data.merge(calendar_data_train, on="d", copy=False)
        .dropna()
        .drop(columns=["d", "id"])
)
full_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19030 entries, 100 to 19129
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   cat_id        19030 non-null  category
 1   state_id      19030 non-null  category
 2   value         19030 non-null  float32 
 3   lag1          19030 non-null  float32 
 4   lag2          19030 non-null  float32 
 5   lag3          19030 non-null  float32 
 6   lag4          19030 non-null  float32 
 7   lag5          19030 non-null  float32 
 8   lag6          19030 non-null  float32 
 9   lag7          19030 non-null  float32 
 10  lag8          19030 non-null  float32 
 11  lag9          19030 non-null  float32 
 12  lag10         19030 non-null  float32 
 13  weekday       19030 non-null  int16   
 14  wday          19030 non-null  int16   
 15  month         19030 non-null  int16   
 16  event_name_1  19030 non-null  int16   
 17  event_type_1  19030 non-null  int16   
 18  even

In [26]:
# create the training x and y data
full_training_data_y = full_training_data["value"]
full_training_data_x = full_training_data.drop(columns=["value"])

# Define categorical features
cat_features = [
    'cat_id', 'state_id', 'holiday', 'weekday', 'wday', 'month',
    'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
]

# separate the training and validation data:
# 100 row labels are drawn at random (fixed seed) for validation, the rest train
np.random.seed(777)
all_row_labels = full_training_data_x.index.values
validation_indices = np.random.choice(all_row_labels, 100, replace = False)
training_indices = np.setdiff1d(all_row_labels, validation_indices)


def _as_lgbm_dataset(row_labels):
    """Wrap the selected rows of the feature and target frames in a LightGBM Dataset."""
    return lgbm.Dataset(
        full_training_data_x.loc[row_labels],
        label = full_training_data_y.loc[row_labels],
        categorical_feature = cat_features,
        free_raw_data = False,
    )


training_data = _as_lgbm_dataset(training_indices)
valid_data = _as_lgbm_dataset(validation_indices)

In [27]:
#build the lightgbm model
# Every key appears exactly once: a key repeated inside a dict literal is
# silently overwritten by its last occurrence, which hides which value is used.
params = {
          "objective" : "poisson",
          "metric": ["rmse"],
          "force_row_wise" : True,
          "learning_rate" : 0.075,
          "sub_row" : 0.75,          # LightGBM alias of bagging_fraction
          "bagging_freq" : 1,
          "lambda_l2" : 0.1,
          'verbosity': 1,
          'num_iterations' : 1200,
          'num_leaves': 128,
          "min_data_in_leaf": 100,
         }

# Train LightGBM model; the validation metric is printed every 20 iterations.
# NOTE(review): in the logged run the validation rmse is lowest around
# iteration 40 and higher afterwards, yet all 1200 iterations are trained --
# consider early stopping or a smaller num_iterations.
m_lgb = lgbm.train(params, training_data, valid_sets = [valid_data], verbose_eval = 20)

# save the model if needed
m_lgb.save_model("model.lgb")



[20]	valid_0's rmse: 2.15302
[40]	valid_0's rmse: 2.04992
[60]	valid_0's rmse: 2.08962
[80]	valid_0's rmse: 2.12971
[100]	valid_0's rmse: 2.19212
[120]	valid_0's rmse: 2.20748
[140]	valid_0's rmse: 2.17926
[160]	valid_0's rmse: 2.19511
[180]	valid_0's rmse: 2.19599
[200]	valid_0's rmse: 2.1883
[220]	valid_0's rmse: 2.18099
[240]	valid_0's rmse: 2.19105
[260]	valid_0's rmse: 2.20047
[280]	valid_0's rmse: 2.20931
[300]	valid_0's rmse: 2.21483
[320]	valid_0's rmse: 2.22799
[340]	valid_0's rmse: 2.24055
[360]	valid_0's rmse: 2.23547
[380]	valid_0's rmse: 2.23116
[400]	valid_0's rmse: 2.23014
[420]	valid_0's rmse: 2.24022
[440]	valid_0's rmse: 2.24383
[460]	valid_0's rmse: 2.21848
[480]	valid_0's rmse: 2.19552
[500]	valid_0's rmse: 2.20172
[520]	valid_0's rmse: 2.20291
[540]	valid_0's rmse: 2.20052
[560]	valid_0's rmse: 2.19437
[580]	valid_0's rmse: 2.18941
[600]	valid_0's rmse: 2.17736
[620]	valid_0's rmse: 2.17217
[640]	valid_0's rmse: 2.1676
[660]	valid_0's rmse: 2.16736
[680]	valid_0's 

<lightgbm.basic.Booster at 0x7f38f03696d0>

In [58]:
# create the initial testing data
lag_size = 10
# Last `lag_size` sales columns of the first 10 series. .copy() makes this an
# independent frame, so the column assignments below do not write into a slice
# of sales_data (which triggers pandas' SettingWithCopyWarning).
testing_data = sales_data.iloc[0:10, -lag_size:].copy()
testing_data['cat_id'] = sales_data.cat_id.iloc[0:10]
testing_data['state_id'] = sales_data.state_id.iloc[0:10]
testing_data['id'] = sales_data.id.iloc[0:10]

# Placeholder columns for the days to be forecast (d_1914 .. d_1941); their
# values are unknown at this point.
for day in range(1914, 1942):
    testing_data[f"d_{day}"] = np.nan

# long format: one row per (id, d) with the sales in "value"
testing_data = pd.melt(testing_data, id_vars=['id', 'cat_id', 'state_id'], var_name="d")

# embed the sales data to have a lag length of `lag_size`
# (the history window kept above holds exactly `lag_size` days)
for lags in range(1, lag_size + 1):
    testing_data['lag' + str(lags)] = testing_data[["id","value"]].groupby("id")["value"].shift(lags)

# merge the calendar data into the sales data
testing_data = testing_data.merge(calendar_data, on="d", copy=False)

In [63]:
# Multipliers applied to the raw model output; one full recursive forecast is
# produced per alpha and the runs are blended with equal weights.
alphas = [1.028]
weights = [1/len(alphas)] * len(alphas)

# Submission columns F1 .. F28, one per forecast day
cols = [f"F{i}" for i in range(1,29)]

# Numeric day of every row ("d_1914" -> 1914). The rows of testing_data are
# never added, removed or reordered below, so this is parsed only once.
day_number = testing_data["d"].str.split("d_").str[1].astype(int)

# perform the recursive forecasting
for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    for prediction_point in range(1914, 1942):
        is_current_day = day_number == prediction_point
        current_testing_data = testing_data[is_current_day].drop(columns=["id", "d", "value"])
        # scale with the alpha of this run (previously a hard-coded 1.028,
        # which made every entry of `alphas` produce the same forecast)
        prediction = alpha * m_lgb.predict(current_testing_data)

        # add the most recent prediction back to the testing data
        testing_data.loc[is_current_day, "value"] = prediction

        # recreate the lags so the next day sees this prediction as lag1;
        # a later alpha run overwrites each day before it is used as a lag
        for lags in range(1, lag_size + 1):
            testing_data['lag' + str(lags)] = testing_data[["id","value"]].groupby("id")["value"].shift(lags)


    # reshape the forecast days into one row per id with columns F1 .. F28
    te_sub = testing_data.loc[day_number >= 1914, ["id", "value"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["value"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        # copy so that weighting the blend does not also rescale te_sub
        sub = te_sub.copy()
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

# Duplicate the rows with the trailing "validation" of each id replaced by
# "evaluation". regex=True is required for the "$" anchor: since pandas 2.0
# str.replace treats the pattern as a literal string by default, which would
# leave the ids unchanged.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

0 1.028 1.0


0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
Name: value, dtype: float64