### XGBoost Classifier

In [None]:
# Install the modelling (xgboost) and evaluation (cvAUC) packages into the
# conda R library. `repos` is spelled out in full rather than relying on
# partial argument matching of `repo`.
install.packages(
  c("xgboost", "cvAUC"),
  lib = "/opt/conda/lib/R/library",
  repos = "http://cran.us.r-project.org"
)

### Data preparation

In [None]:
library(xgboost)
library(Matrix)
library(cvAUC)
# Load the 2-class HIGGS dataset (training and test splits).
train <- read.csv("data/higgs_train_10k.csv")
test <- read.csv("data/higgs_test_5k.csv")
# Set seed because we column-sample (colsample_bytree < 1 during training).
set.seed(1)

head(train)

# Drop the columns that must not be used as predictors.
train$EventId <- NULL
test$EventId <- NULL
train$Weight <- NULL
test$Weight <- NULL

# Encode the target as numeric 0/1: label "s" -> 1, anything else -> 0.
train$response <- ifelse(train$Label == "s", 1, 0)
test$response <- ifelse(test$Label == "s", 1, 0)
head(train)
head(test)

# Remove the raw label from BOTH data sets. Column names are case-sensitive:
# assigning NULL to a non-existent `label` column is a silent no-op, which
# would leave `Label` in `test` and leak the target into the test features.
train$Label <- NULL
test$Label <- NULL

# Name of the response column, used to pull labels out of the data frames.
y <- "response"

### Run model and test it

In [None]:
# Build sparse design matrices (response against every remaining column) and
# wrap each one, together with its labels, in an xgboost DMatrix.
train.mx <- sparse.model.matrix(response ~ ., data = train)
test.mx <- sparse.model.matrix(response ~ ., data = test)
dtrain <- xgb.DMatrix(train.mx, label = train[[y]])
dtest <- xgb.DMatrix(test.mx, label = test[[y]])

# Booster settings for binary classification with logistic loss.
# (The multi-class settings num_class / eval_metric = "mlogloss" are
# intentionally left out.)
booster_params <- list(
  objective = "binary:logistic",
  eta = 0.3,
  max_depth = 5,
  subsample = 1,
  colsample_bytree = 0.5
)

# Data sets whose evaluation metric is reported after every boosting round.
eval_sets <- list(train = dtrain, test = dtest)

train.gdbt <- xgb.train(
  params = booster_params,
  data = dtrain,
  nrounds = 70,
  watchlist = eval_sets
)

# True test labels and the model's predictions for the same rows
labels <- test[[y]]
preds <- predict(train.gdbt, newdata = dtest)

# Compute AUC on the test set
cvAUC::AUC(predictions = preds, labels = labels)


### Advanced functionality of xgboost

In [None]:

install.packages(
  "Ckmeans.1d.dp",
  lib = "/opt/conda/lib/R/library",
  repos = "http://cran.us.r-project.org"
)
library(Ckmeans.1d.dp)

# Compute feature importance matrix.
# The feature names must match the columns the model was trained on, i.e. the
# columns of the design matrix (which includes "(Intercept)" and excludes the
# response) -- not the columns of the raw data frame.
feature_names <- colnames(train.mx)
importance_matrix <- xgb.importance(
  feature_names = feature_names,
  model = train.gdbt
)

# Plot the (up to) 10 most important features; head() avoids the NA rows that
# 1:10 indexing would produce when fewer than 10 features are reported.
xgb.plot.importance(head(importance_matrix, 10L))