# Example 2: Custom performance measures in the caret package

### Load the relevant packages and register cores for parallelization

In [1]:
# Packages attached for this notebook. The order is kept as-is because later
# packages mask same-named functions from earlier ones.
library(caret)
library(doParallel)
library(readr)
library(dplyr)
library(xgboost)

# Register a doParallel backend that uses 4 cores.
registerDoParallel(cores=4)

# NOTE(review): hard-coded, machine-specific working directory. The relative
# path './data_for_testing.csv' read later is resolved against it.
setwd("D:/Project_2017/Training_0331")

Loading required package: lattice
Loading required package: ggplot2
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'xgboost' was built under R version 3.3.3"
Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice



### Reading in the data
#### The data is the Titanic survival dataset from Kaggle

In [2]:
# Read the CSV from the current working directory and preview the first rows.
datafile <- "./data_for_testing.csv"
data <- datafile %>% readr::read_csv()
data %>% head()

y,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,1,40.0,0,0,27.7208,0
1,1,0,29.9,1,0,146.5208,0
0,2,1,66.0,0,0,10.5,2
0,1,1,42.0,1,0,52.0,2
1,2,0,5.0,1,2,27.75,2
1,3,1,29.9,1,1,15.2458,0


#### 1. Since this is a classification task, we need to convert the label to a factor

In [3]:
# Recode the 0/1 outcome `y` as a factor label: "S" where y == 1, "D" otherwise.
data[["response"]] <- factor(ifelse(data[["y"]] == 1, "S", "D"))

# Drop the numeric outcome so that only the predictors and the factor label
# remain in the modelling data.
data2 <- select(data, -y)
head(data2)

Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,response
1,1,40.0,0,0,27.7208,0,D
1,0,29.9,1,0,146.5208,0,S
2,1,66.0,0,0,10.5,2,D
1,1,42.0,1,0,52.0,2,D
2,0,5.0,1,2,27.75,2,S
3,1,29.9,1,1,15.2458,0,S


#### 2. Create our own summary function to pass a custom measure to the tuning process

In [4]:
# Evaluating the bare name prints the source of caret's built-in two-class
# summary function (presumably shown as a template for the custom summary
# function defined in the next cell -- confirm).
twoClassSummary

In [5]:
#' PPV (precision) at a given recall, usable as a caret summary function
#'
#' Builds the PPV-versus-sensitivity curve with ROCR, treating the first level
#' of `data$obs` as the positive class and using the probability column named
#' after that level as the score. It then reports the PPV at the curve point
#' whose recall is closest to `target_recall`.
#'
#' @param data Data frame with factor columns `obs` and `pred` and a numeric
#'   column named after the first outcome level (class probabilities, so
#'   `classProbs = TRUE` is needed in `trainControl()`).
#' @param lev Outcome levels. Accepted for compatibility with the
#'   `summaryFunction` signature; the levels are read from `data$obs`, so the
#'   function also works when `lev` is left at `NULL`.
#' @param model Unused; accepted for compatibility with the same signature.
#' @param target_recall Recall (sensitivity) at which PPV is reported. A single
#'   number in [0, 1]; defaults to 0.2.
#' @return Named numeric vector with `PPVAtGivenRecall` (PPV at the selected
#'   point), `SelectedRecall` (the recall actually found on the curve) and
#'   `GivenRecall` (the requested `target_recall`).
ppvAtGivenRecall <- function(data, lev = NULL, model = NULL,
                             target_recall = 0.2) {
  lvls <- levels(data$obs)
  if (length(lvls) > 2) {
    stop(paste("Your outcome has", length(lvls), "levels. The ppvAtGivenRecall() function isn't appropriate."))
  }
  if (!all(levels(data[["pred"]]) == lvls)) {
    stop("levels of observed and predicted data do not match")
  }
  stopifnot(
    is.numeric(target_recall),
    length(target_recall) == 1L,
    !is.na(target_recall),
    target_recall >= 0,
    target_recall <= 1
  )

  # Label the first level as the positive class (1) so that it matches the
  # probability column used as the score. Taking the level from `lvls` rather
  # than `lev` keeps this working when `lev` is NULL.
  is_positive <- as.numeric(data$obs == lvls[1])
  rocr_pred <- ROCR::prediction(data[[lvls[1]]], is_positive)

  # x.values hold the sensitivity (recall), y.values the PPV at each cutoff.
  ppv_curve <- ROCR::performance(rocr_pred, "ppv", "sens")
  recall <- ppv_curve@x.values[[1]]
  ppv <- ppv_curve@y.values[[1]]

  # Pick the first curve point whose recall is closest to the target.
  closest <- which.min(abs(recall - target_recall))

  out <- c(ppv[closest], recall[closest], target_recall)
  names(out) <- c("PPVAtGivenRecall", "SelectedRecall", "GivenRecall")
  out
}


#### 3. Test our new function in `train`

In [6]:
# 3-fold cross-validation. classProbs = TRUE is set because the custom
# summary function reads the per-class probability columns.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = ppvAtGivenRecall
)

# Tuning grid: every combination of the values below (2 x 2 x 2 = 8 models).
# NOTE(review): newer caret releases appear to require additional xgbTree
# tuning columns (gamma, colsample_bytree, min_child_weight, subsample) --
# confirm against the installed caret version.
grid <- expand.grid(
  nrounds = c(100, 250),
  max_depth = c(5, 6),
  eta = c(0.1, 0.5)
)

# Time the tuning run. `metric` must match a name returned by the summary
# function so that caret selects the model on PPV at the given recall.
tm1 <- Sys.time()
xgb_re <- caret::train(
  response ~ .,
  data = data2,
  method = "xgbTree",
  tuneGrid = grid,
  verbose = TRUE,
  trControl = ctrl,
  metric = "PPVAtGivenRecall"
)

print(Sys.time() - tm1)


Loading required package: plyr
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------

Attaching package: 'plyr'

The following objects are masked from 'package:dplyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize



Time difference of 7.968797 secs


In [7]:
xgb_re

eXtreme Gradient Boosting 

1309 samples
   7 predictor
   2 classes: 'D', 'S' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 872, 873, 873 
Resampling results across tuning parameters:

  eta  max_depth  nrounds  PPVAtGivenRecall  SelectedRecall  GivenRecall
  0.1  5          100      0.9277394         0.1997585       0.2        
  0.1  5          250      0.9431064         0.1997585       0.2        
  0.1  6          100      0.9181983         0.1997585       0.2        
  0.1  6          250      0.9327485         0.1997585       0.2        
  0.5  5          100      0.9272031         0.1997585       0.2        
  0.5  5          250      0.9375609         0.1997585       0.2        
  0.5  6          100      0.9377395         0.1997585       0.2        
  0.5  6          250      0.9436710         0.1997585       0.2        
  PPVAtGivenRecall SD  SelectedRecall SD  GivenRecall SD
  0.033057839          0.0004183698       0             
  0.01