### Simple ML approach through ensemble of XGBoost and LightGBM models
This approach achieved a 12.67808 SMAPE on the private leaderboard (93/462)

Start by loading the required libraries into the environment

In [None]:
library(lightgbm) # Hard installation
library(caret)

Load the data

In [None]:
# Read the raw training and test sets from the Data folder that sits next to
# the working directory.
trainRaw <- read.csv(file = "../Data/train.csv")
testRaw  <- read.csv(file = "../Data/test.csv")

You only need to do this if you run this code in a Kaggle kernel. The problem is caused by a mismatch between the number of threads available on Kaggle's machine and the number of threads selected by the boosting algorithms

In [None]:
# Fix the number of OpenMP threads at 4. Only needed when running inside a
# Kaggle kernel, where the thread count chosen by the boosting libraries does
# not match what the machine provides.
library(OpenMPController)
omp_set_num_threads(4)

Now let's preprocess the data

In [None]:
# Build the calendar features once, on train and test stacked together, so
# both sets are guaranteed to share the same columns and encodings.
# The target (sales) and the submission id are left out of the stacked frame
# and re-attached afterwards; columns are selected by name rather than by
# position so a different column order in the CSV files cannot break this.
auxDF <- rbind(trainRaw[, setdiff(names(trainRaw), "sales"), drop = FALSE],
               testRaw[, setdiff(names(testRaw), "id"), drop = FALSE])

# Dates are expected as "YYYY-MM-DD" strings; as.character() also covers the
# case where read.csv() returned the column as a factor.
auxDF$date <- as.character(auxDF$date)
auxDF$day <- as.numeric(substr(auxDF$date, 9, 10)) # Day of month
auxDF$month <- as.numeric(substr(auxDF$date, 6, 7)) # Month
auxDF$year <- as.numeric(substr(auxDF$date, 1, 4)) # Year
# Day of week as an integer code. weekdays() returns locale-dependent names and
# as.factor() orders them alphabetically, so the code is an arbitrary (but
# consistent across train and test) label, not Monday = 1 ... Sunday = 7.
auxDF$dayWeek <- as.numeric(as.factor(weekdays(as.Date(auxDF$date))))
auxDF$date <- NULL

# Quarter: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
auxDF$quarter <- ceiling(auxDF$month / 3)

# Training set: first nrow(trainRaw) rows of the stacked frame plus the target
nTrain <- nrow(trainRaw)
train <- auxDF[seq_len(nTrain), ]
train$sales <- trainRaw$sales
# Drop leap days (29 Feb). A logical mask is used instead of -which(...):
# when nothing matches, -which(...) is an empty index and would drop every row.
isLeapDay <- train$day == 29 & train$month == 2
train <- train[!(isLeapDay %in% TRUE), ]
train$day <- NULL

# Test set: remaining rows, with the submission id moved to the first column
test <- auxDF[nTrain + seq_len(nrow(auxDF) - nTrain), ]
test$id <- testRaw$id
test <- test[, c("id", setdiff(names(test), "id"))]
test$day <- NULL

We need to define the SMAPE functions so that they can be passed to caret and LightGBM as custom evaluation metrics

In [None]:
# Summary function in the format expected by caret::trainControl().
#
# data:  data frame with the observed values in `obs` and the predictions
#        in `pred`.
# lev, model: part of caret's summary-function signature; not used here.
#
# Returns a named numeric vector `c(SMAPEcaret = <value>)` holding the
# NEGATED SMAPE, so that a larger value means a better fit.
SMAPEcaret <- function(data, lev = NULL, model = NULL) {
  negSmape <- -Metrics::smape(actual = data$obs, predicted = data$pred)
  c(SMAPEcaret = negSmape)
}

# Custom evaluation function in the format expected by lgb.train(eval = ...).
#
# preds:  numeric vector of predictions produced by the booster.
# dtrain: lgb.Dataset the predictions refer to; its "label" field holds the
#         observed values.
#
# Returns a list with the metric name, its value and `higher_better = FALSE`
# (a lower SMAPE is better).
SMAPElightgbm <- function(preds, dtrain) {

  # NOTE(review): getinfo() is the accessor this file was written against;
  # newer lightgbm releases may expose get_field() instead - confirm against
  # the installed version.
  actual <- getinfo(dtrain, "label")

  # Pass the arguments in the (actual, predicted) order of Metrics::smape(),
  # matching SMAPEcaret. SMAPE is symmetric, so the score itself is unchanged.
  score <- Metrics::smape(actual = actual, predicted = preds)

  list(name = "SMAPE", value = score, higher_better = FALSE)

}

Now we will set the optimal hyperparameters for the XGB model

In [None]:
# One fitted caret/XGBoost model per item is stored in this list.
modelsXGB <- list()

# method = "none": no resampling, a single model is fitted on all the rows
# passed to train(). The SMAPE summary function is registered alongside it.
fitControl <- trainControl(
  method = "none",
  summaryFunction = SMAPEcaret
)

# Single-row grid: the hyperparameters are fixed, nothing is tuned here.
parametersXGB <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.2,
  gamma = 0.6,
  colsample_bytree = 0.5,
  min_child_weight = 2,
  subsample = 1
)

Same for the lightGBM model

In [None]:
# One fitted LightGBM booster per item is stored in this list.
modelsLGB <- list()

# Fixed LightGBM parameters shared by every per-item model.
lgbGrid <- list(
  objective = "regression",
  min_sum_hessian_in_leaf = 1,
  feature_fraction = 0.7, # column subsampling
  bagging_fraction = 0.7, # row subsampling ...
  bagging_freq = 5,       # ... re-drawn every 5 iterations
  max_bin = 50,
  lambda_l1 = 1,
  lambda_l2 = 1.3,
  min_data_in_bin = 100,
  min_gain_to_split = 10,
  min_data_in_leaf = 30
)

Fit the model for each item through a for loop

In [None]:
# Fit one XGBoost model (through caret) and one LightGBM model per item.
# Each model is stored in modelsXGB / modelsLGB at the position given by the
# item id, which is how the prediction step looks them up.

# LightGBM settings that used to be passed through `...` of lgb.train() are
# merged into the parameter list instead: newer lightgbm releases warn about
# (and later reject) parameters that are not supplied via `params`.
lgbParams <- modifyList(lgbGrid,
                        list(learning_rate = 0.1,
                             num_leaves = 10,
                             num_threads = 3))

# Items are taken from the data instead of assuming exactly 1:50.
itemIds <- sort(unique(train$item))

for (item in itemIds) {

  # Progress info, one line per item
  cat(item, " | ", length(itemIds), "\n", sep = "")

  auxTrain <- train[which(train$item == item), ]
  auxTrain$item <- NULL

  # Split target and predictors by name rather than by column position
  label <- auxTrain$sales
  predictors <- auxTrain[, setdiff(names(auxTrain), "sales"), drop = FALSE]

  # XGBoost. caret::train is called explicitly because `train` is also the
  # name of the training data frame in this script.
  modelsXGB[[item]] <- caret::train(x = predictors,
                                    y = label,
                                    method = "xgbTree",
                                    trControl = fitControl,
                                    tuneGrid = parametersXGB,
                                    metric = "SMAPEcaret")

  # LightGBM works on a (sparse) numeric matrix plus a label vector
  auxTrainMatrix <- Matrix::Matrix(as.matrix(predictors), sparse = TRUE)
  auxTrainLGB <- lgb.Dataset(data = auxTrainMatrix, label = label)

  modelsLGB[[item]] <- lgb.train(params = lgbParams, data = auxTrainLGB,
                                 nrounds = 500, eval_freq = 20,
                                 eval = SMAPElightgbm, verbose = -1)

}

Generate the predictions

In [None]:
# Predict the test rows item by item with the matching per-item models.
#
# The prediction vectors are preallocated with one slot per test row and
# filled by row index. This (a) defines predsXGB / predsLGB before first use,
# (b) keeps every prediction aligned with its row of `test` regardless of how
# `test` is sorted, and (c) makes the cell safe to re-run (no appending).
predsXGB <- numeric(nrow(test))
predsLGB <- numeric(nrow(test))

# Predictor columns: everything except the id, the item key and, if this cell
# is re-run after the ensemble step, the sales column added there.
featureCols <- setdiff(names(test), c("id", "item", "sales"))

itemIds <- sort(unique(test$item))

for (item in itemIds) {

  # Progress info, one line per item
  cat(item, " | ", length(itemIds), "\n", sep = "")

  indices <- which(test$item == item)
  auxTest <- test[indices, featureCols, drop = FALSE]

  # XGBoost
  predsXGB[indices] <- predict(modelsXGB[[item]], auxTest)

  # LightGBM expects the same sparse-matrix layout used for training
  auxTestMatrix <- Matrix::Matrix(as.matrix(auxTest), sparse = TRUE)
  predsLGB[indices] <- predict(modelsLGB[[item]], auxTestMatrix)

}

Multiply by the "magic number" to slightly improve the predictions, then build the ensemble by averaging the two models

In [None]:
# Scale both prediction vectors by the 1.03 "magic number", then ensemble the
# two models with an unweighted per-row mean.
predsXGB <- 1.03 * predsXGB
predsLGB <- 1.03 * predsLGB
test$sales <- rowMeans(
  data.frame(predsXGB, predsLGB)
)

Generate the submission file

In [None]:
# Fill the sample submission with the ensembled predictions and write it out.
# Predictions are matched to the submission rows by id instead of by position,
# so a different row order in sample_submission.csv cannot silently misalign
# them.
submission <- read.csv("../Data/sample_submission.csv")

if (!("id" %in% names(submission))) {
  stop("sample_submission.csv has no 'id' column", call. = FALSE)
}

rowMatch <- match(submission$id, test$id)
if (anyNA(rowMatch)) {
  stop("some submission ids have no prediction in 'test'", call. = FALSE)
}

submission$sales <- test$sales[rowMatch]
write.csv(submission, "lgb_xgb_ensemble.csv", row.names = FALSE)