# Example 1: Custom tuning parameters in the caret package

### Load the relevant packages and register cores for parallelization

In [1]:
library(caret)
library(doParallel)
library(readr)
library(randomForest)
library(dplyr)

# Register a four-worker parallel backend so that foreach-based loops can run
# in parallel.
registerDoParallel(cores = 4)

# Relative file paths used in later cells are resolved against this directory.
# NOTE(review): hard-coded absolute path; the notebook only runs on a machine
# where this directory exists.
setwd("D:/Project_2017/Training_0331")

Loading required package: lattice
Loading required package: ggplot2
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
randomForest 4.6-10
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin


Attaching package: 'dplyr'

The following object is masked from 'package:randomForest':

    combine

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



### Reading in the data
#### The data is the Titanic survival dataset from Kaggle

In [2]:
# Location of the input CSV, relative to the current working directory.
datafile <- "./data_for_testing.csv"

# Parse the CSV and preview the first rows.
data <- readr::read_csv(file = datafile)
head(data)

y,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,1,40.0,0,0,27.7208,0
1,1,0,29.9,1,0,146.5208,0
0,2,1,66.0,0,0,10.5,2
0,1,1,42.0,1,0,52.0,2
1,2,0,5.0,1,2,27.75,2
1,3,1,29.9,1,1,15.2458,0


#### 1. Since this is a classification task, we need to convert the label to a factor

In [3]:
# Recode the numeric label as a factor column `response`:
# "S" where y equals 1, "D" otherwise.
data <- data %>%
  mutate(response = as.factor(ifelse(y == 1, "S", "D")))

# Modelling table: the same columns minus the original numeric label.
data2 <- data %>%
  select(-y)
head(data2)

Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,response
1,1,40.0,0,0,27.7208,0,D
1,0,29.9,1,0,146.5208,0,S
2,1,66.0,0,0,10.5,2,D
1,1,42.0,1,0,52.0,2,D
2,0,5.0,1,2,27.75,2,S
3,1,29.9,1,1,15.2458,0,S


#### 2. Create our own model functions so that ntree and nodesize can be tuned together with mtry in random forest

In [4]:
# Look up caret's model definition named exactly "rf" (regex matching off) and
# take the first element of the returned list.
fullRF <- getModelInfo(model = "rf", regex = FALSE)[[1]]

# Show the definition; its `parameters` and `fit` components are replaced in
# the next cell.
fullRF

$label
[1] "Random Forest"

$library
[1] "randomForest"

$loop
NULL

$type
[1] "Classification" "Regression"    

$parameters
  parameter   class                         label
1      mtry numeric #Randomly Selected Predictors

$grid
function (x, y, len = NULL) 
{
    data.frame(mtry = caret::var_seq(p = ncol(x), classification = is.factor(y), 
        len = len))
}

$fit
function (x, y, wts, param, lev, last, classProbs, ...) 
randomForest(x, y, mtry = param$mtry, ...)

$predict
function (modelFit, newdata, submodels = NULL) 
predict(modelFit, newdata)

$prob
function (modelFit, newdata, submodels = NULL) 
predict(modelFit, newdata, type = "prob")

$predictors
function (x, ...) 
{
    varIndex <- as.numeric(names(table(x$forest$bestvar)))
    varIndex <- varIndex[varIndex > 0]
    varsUsed <- names(x$forest$ncat)[varIndex]
    varsUsed
}

$varImp
function (object, ...) 
{
    varImp <- randomForest::importance(object, ...)
    if (object$type == "regression") 
        varImp <- data.fr

In [5]:
# Tuning parameters exposed by the custom model: ntree and nodesize are added
# alongside mtry so that all three can appear as columns of a tuning grid.
prm <- data.frame(parameter = c("ntree", "mtry", "nodesize"),
                  class = rep("numeric", 3),
                  label = c("ntree", "mtry", "nodesize"))

fullRF$parameters <- prm

# Keep the default-grid generator consistent with the new parameter table.
# The stock generator only produces an `mtry` column, which no longer matches
# `fullRF$parameters` when train() is called without an explicit tuneGrid.
#   x, y   : predictors and outcome, as handed over by train()
#   len    : number of mtry candidates requested (tuneLength)
#   search : accepted for compatibility with callers that pass it; random
#            search is not implemented, so it is ignored
# ntree and nodesize are held at randomForest's documented defaults
# (500 trees; nodesize 1 for classification, 5 for regression).
fullRF$grid <- function(x, y, len = NULL, search = "grid") {
  expand.grid(
    ntree = 500,
    mtry = caret::var_seq(p = ncol(x), classification = is.factor(y),
                          len = len),
    nodesize = if (is.factor(y)) 1 else 5
  )
}

# Replacement fit function for the custom random forest model.
#
# Arguments:
#   x, y   : predictors and outcome, forwarded to randomForest()
#   param  : holds the tuning values; its ntree, mtry and nodesize entries
#            are forwarded to randomForest()
#            (presumably one row of the tuning grid -- confirm against caret)
#   wts, lev, last, weights, classProbs :
#            accepted as part of the signature but not used in the body
#   ...    : passed through unchanged to randomForest()
#
# Returns the object produced by randomForest().
RFfit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
  forest <- randomForest(x = x, y = y, ntree = param$ntree, mtry = param$mtry,
                         nodesize = param$nodesize, ...)
  forest
}

# Install the custom fit function into the model definition.
fullRF[["fit"]] <- RFfit

#### 3. Test our new functions in `train`

In [6]:
# Fix the RNG state before resampling so that repeated runs of this cell use
# the same random draws (the seed value itself is arbitrary).
# NOTE(review): with a parallel backend, full reproducibility of the model
# fits may additionally require trainControl(seeds = ...) -- confirm.
set.seed(2017)

# 3-fold cross-validation with class probabilities enabled; performance is
# summarised by twoClassSummary (ROC, Sens, Spec).
ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE,
                     summaryFunction = twoClassSummary)

# Every combination of the three tuning parameters (2 x 2 x 2 = 8 candidates).
grid <- expand.grid(ntree = c(10, 50), mtry = c(5, 6), nodesize = c(50, 80))

# Time the full tuning run using the custom model definition.
tm1 <- Sys.time()
rf_re <- caret::train(response ~ ., data = data2, method = fullRF,
                      tuneGrid = grid, trControl = ctrl, metric = "ROC")
# Optional extra argument for the call above: preProc = c('center', 'scale')

print(Sys.time() - tm1)


Time difference of 5.518552 secs


In [7]:
# Display the fitted train object: resampling results (ROC, Sens, Spec and
# their SDs) for each combination of ntree, mtry and nodesize.
rf_re

Random Forest 

1309 samples
   7 predictor
   2 classes: 'D', 'S' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 873, 872, 873 
Resampling results across tuning parameters:

  ntree  mtry  nodesize  ROC        Sens       Spec       ROC SD    
  10     5     50        0.8724670  0.9213219  0.7598344  0.01597703
  10     5     80        0.8737246  0.9140448  0.7805383  0.01392912
  10     6     50        0.8725496  0.9201054  0.7660455  0.02929949
  10     6     80        0.8691415  0.9201098  0.7743271  0.02221739
  50     5     50        0.8900877  0.9188933  0.7743271  0.02816517
  50     5     80        0.8767199  0.9176856  0.7826087  0.02607718
  50     6     50        0.8860131  0.9176812  0.7763975  0.02220675
  50     6     80        0.8791604  0.9164778  0.7743271  0.01453076
  Sens SD     Spec SD   
  0.02746216  0.06032630
  0.02728651  0.06096245
  0.01918285  0.05209004
  0.02203858  0.05635883
  0.01712991  0.05567010
  0.01786070  0.05

In [8]:
# Display the candidate grid. In the grid, ntree (the first argument to
# expand.grid) varies fastest, so the row order here differs from the order of
# the results table printed for rf_re.
grid

ntree,mtry,nodesize
10,5,50
50,5,50
10,6,50
50,6,50
10,5,80
50,5,80
10,6,80
50,6,80
