<div >
<img src = "../banner.jpg" />
</div>

# Bagging 

## Recap trees

In [None]:
# Load libraries.
# library() stops with an error if pacman is not installed, whereas require()
# only returns FALSE and lets the script fail later at p_load().
library(pacman)
p_load(tidyverse, rpart, caret)


# Read the data
credit <- readRDS(url("https://github.com/ignaciomsarmiento/datasets/blob/main/credit_class.rds?raw=true"))

# Recode the categorical variables as labelled factors
credit <- credit %>%
  mutate(
    Default = factor(Default, levels = c(0, 1), labels = c("No", "Si")),
    history = factor(history,
                     levels = c("good", "poor", "terrible"),
                     labels = c("buena", "mala", "terrible")),
    foreign = factor(foreign,
                     levels = c("foreign", "german"),
                     labels = c("extranjero", "aleman")),
    purpose = factor(purpose,
                     levels = c("newcar", "usedcar", "goods/repair", "edu", "biz"),
                     labels = c("auto_nuevo", "auto_usado", "bienes", "educacion", "negocios"))
  )

# Make "Si" the first (reference) level of Default, so the level order
# becomes c("Si", "No")
credit <- credit %>% mutate(Default = relevel(Default, ref = "Si"))

In [None]:
# Preview the first rows of the recoded credit data
head(credit)

### División de la muestra

- El objetivo es predecir bien fuera de muestra

- No queremos sobreajustar
  

In [None]:
# Fix the RNG seed so the partition below is reproducible
set.seed(1011)

# Row indices for the training set: y is the outcome (target) variable and
# p = 0.7 puts 70% of the data in the training set; list = FALSE asks for the
# indices as a matrix instead of a list.
inTrain <- createDataPartition(y = credit$Default, p = 0.7, list = FALSE)

# Rows selected by inTrain form the training set; the rest form the test set
train <- credit[inTrain, ]
test <- credit[-inTrain, ]

In [None]:
# Fit a single classification tree on the training set
arbol <- rpart(
  formula = Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  method = "class"
)

# Print the fitted tree
arbol

In [None]:
# Plot the fitted tree with rpart.plot
p_load(rpart.plot)
prp(
  arbol,
  under = TRUE,
  branch.lty = 2,
  yesno = 2,
  faclen = 0,
  varlen = 15,
  box.palette = "-RdYlGn"
)

In [None]:
p_load(Metrics)

# Numeric version of the test outcome: 1 = "Si" (default), 0 = otherwise
default <- ifelse(test$Default == "Si", 1, 0)

# Predicted class probabilities for the test set (one column per class level)
pred_prob <- predict(arbol, newdata = test, type = "prob")

# Default was releveled so that "Si" is the first level, which makes column 2
# the probability of "No". Select the "Si" column by name so the score lines
# up with `default` (1 = "Si"); using column 2 would score the wrong class.
aucval_arbol <- Metrics::auc(actual = default, predicted = pred_prob[, "Si"])

aucval_arbol

## Coding intuition bagging

## Bosques "from scratch"



In [None]:
### Bootstrapped samples
set.seed(1011)

# Number of bootstrap replicates
B <- 50

# Preallocate one slot per replicate instead of growing inside the loop
modelo <- vector("list", B)
pred <- vector("list", B)
aucval <- numeric(B)

for (i in seq_len(B)) {
  # Sample with replacement, same size as the training set (size = 1 is 100%)
  db_sample <- sample_frac(train, size = 1, replace = TRUE)

  # Fit one classification tree on this bootstrap sample
  modelo[[i]] <- rpart(
    Default ~ duration + amount + installment + age +
      history + purpose + foreign + rent,
    data = db_sample,
    method = "class"
  )

  # Predicted class probabilities on the test set
  pred[[i]] <- predict(modelo[[i]], newdata = test, type = "prob")

  # `default` codes "Si" as 1, so score with the "Si" column. Column 2 is
  # "No" because "Si" was made the first level of Default.
  aucval[i] <- Metrics::auc(actual = default, predicted = pred[[i]][, "Si"])
}

In [None]:
# Distribution of the test-set AUC across the bootstrap trees
boxplot(aucval)

In [None]:
# Average test-set AUC over the bootstrap trees
mean(aucval)

In [None]:
p_load(ipred)
set.seed(1011)

# Bagged trees via ipred, with nbagg = 50 bootstrap replicates
bagged_tree <- bagging(
  Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  nbagg = 50
)

# Class probabilities for the test set
bagged_pred <- predict(bagged_tree, newdata = test, type = "prob")

In [None]:
# `default` codes "Si" as 1, so score with the "Si" probability column.
# "Si" is the first level of Default, so column 2 would be P("No").
aucval_ipred <- Metrics::auc(actual = default, predicted = bagged_pred[, "Si"])
aucval_ipred

# Random Forests

In [None]:
# Summary function for caret resampling: concatenates the metrics returned by
# twoClassSummary() and defaultSummary(), forwarding all arguments to both.
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

# Resampling setup shared by the train() calls in this file: 5-fold
# cross-validation, class probabilities computed, and the hold-out
# predictions kept.
ctrl <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = fiveStats,
  classProbs = TRUE,
  verboseIter = FALSE,    # full argument name; `verbose` relied on partial matching
  savePredictions = TRUE  # TRUE rather than T, which can be reassigned
)


In [None]:
p_load(randomForest)

# Random forest fitted through caret with the resampling setup in `ctrl`,
# selecting the tuning value by the "Sens" metric.
# The trailing comma after the last argument was removed: it passes an empty
# argument to train().
forest <- train(
  Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  method = "rf",
  trControl = ctrl,
  metric = "Sens"
)

In [None]:
# Print the caret training summary for the random forest
forest

## Hiperparámetros



https://topepo.github.io/caret/train-models-by-tag.html#random-forest

In [None]:
# Square root of the number of columns in train.
# NOTE(review): ncol(train) also counts Default and any column left out of the
# model formula -- confirm this is the intended predictor count.
sqrt(ncol(train))

In [None]:
# Candidate mtry values: 1, 3, 5, ... up to the number of columns in train.
# NOTE(review): the upper bound is ncol(train), which includes the outcome
# column; values above the number of predictors may not be valid -- confirm.
mtry_grid <- expand.grid(mtry = seq(1, ncol(train), 2))
mtry_grid

In [None]:

# Random forest tuned over the mtry values in mtry_grid, using the resampling
# setup in ctrl and selecting on "Sens"; ntree = 10 is passed along with the
# other arguments to train().
bosque <- train(
  Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  method = "rf",
  trControl = ctrl,
  metric = "Sens",
  tuneGrid = mtry_grid,
  ntree = 10
)

# Print the tuning results
bosque

In [None]:
# Plot the tuning results of the bosque fit
plot(bosque)

In [None]:
# Inspect the final model stored in the tuned bosque object
bosque$finalModel

In [None]:
# Predicted classes for the test set from the tuned forest
bosque_pred <- predict(
  bosque,
  newdata = test,
  type = "raw"
)

# Confusion matrix: predictions as `data`, observed outcome as `reference`
confusionMatrix(
  data = bosque_pred,
  reference = test$Default
)

### Variable Importance



In [None]:
# Variable importance for the `forest` fit, requested with scale = TRUE
varImp(forest,scale=TRUE)

# Boosted Trees
## ADA boosting

<div>
<img src="figures/adaboost.png" width="800"/>
</div>

In [None]:
# https://topepo.github.io/caret/train-models-by-tag.html#boosting
p_load(fastAdaboost)
set.seed(1410)

# Tuning grid for caret's "adaboost" model: number of boosting iterations and
# the boosting variant. caret's values for `method` are "Adaboost.M1" and
# "Real adaboost"; the previous value "adaboost" is neither of them, so it did
# not select classic AdaBoost.M1 as this section intends.
M_grid <- expand.grid(nIter = c(10, 50, 100), method = "Adaboost.M1")
M_grid


In [None]:
# AdaBoost fitted through caret over the values in M_grid, using the
# resampling setup in ctrl and selecting on "Sens"
adaboost_res <- train(
  Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  method = "adaboost",
  trControl = ctrl,
  metric = "Sens",
  tuneGrid = M_grid
)

# Print the tuning results
adaboost_res


In [None]:
# Predict with the fitted caret model `adaboost_res`. The bare name `adaboost`
# is the fastAdaboost fitting function, not a fitted model.
pred_ada <- predict(adaboost_res, newdata = test)

# Predictions go in `data` and the observed outcome in `reference`, matching
# the random-forest evaluation earlier in this file.
confusionMatrix(data = pred_ada, reference = test$Default)

# Traditional GBM

<div>
<img src="figures/boosted-trees-process.png" width="1000"/>
</div>

In [None]:
p_load(gbm)

# Gradient boosting fitted through caret with the resampling setup in ctrl,
# selecting on "Sens". The `family = "binomial"` argument is left disabled,
# as in the original cell.
gbm_res <- train(
  Default ~ duration + amount + installment + age +
    history + purpose + foreign + rent,
  data = train,
  method = "gbm",
  trControl = ctrl,
  metric = "Sens"
)

# Print the tuning results
gbm_res
