# 1 Random Forests

### 1.1 Predicting Hospital Readmission

Using the `diabetes_data_clean` data set from previous examples, suppose we want to use random forests to build a model for predicting whether or not a patient will later be readmitted to the hospital.

In [None]:
library(ranger)
library(randomForest)
library(tidyverse)

# Read the cleaned diabetes encounter data from the working directory.
# NOTE(review): the `titanic*` object names look like leftovers from an
# earlier example; the data loaded here is the diabetes readmission set.
titanic_raw <- read.csv(file = "diabetes_data_clean.csv")

In [None]:
# Drop the columns that are not used as predictors (encounter/patient
# identifiers plus several other fields), then keep only the rows that have
# no missing values in the remaining columns.
titanic <- select(
  titanic_raw,
  -c(
    encounter_id, patient_nbr, weight, medical_specialty,
    payer_code, diag_1, diag_2, diag_3
  )
)
titanic <- filter(titanic, complete.cases(titanic))



### 1.2 Specify Subsets for Training and Test 

In [None]:
# Randomly assign 80% of the rows to the training set and the remaining rows
# to the test set. No seed is set here, so the split differs between runs;
# call set.seed() beforehand if a reproducible split is needed.
n_total <- nrow(titanic)
n_train <- floor(0.80 * n_total)

# sample.int()/seq_len() stay correct when n_total is 0, whereas 1:n_total
# would expand to c(1, 0). For n_total >= 1 the indices drawn are the same
# as with sample(1:n_total, n_train).
train <- sample.int(n_total, n_train)
test <- setdiff(seq_len(n_total), train)

In [None]:
# Recode the outcome as logical: TRUE for every value other than "NO".
titanic[["readmitted"]] <- titanic[["readmitted"]] != "NO"

### 1.3 Fit Random Forest

In [None]:
# Fit a random forest on the training rows with ranger. The response is
# wrapped in factor() so the forest is grown for classification, and
# impurity-based importance is requested so it can be extracted from the
# fitted object afterwards.
fm1 <- ranger(factor(readmitted) ~ .,
              data = titanic[train, ],
              importance = "impurity",
              # Spelled out in full; `num.tree` only worked through partial
              # argument matching.
              num.trees = 1000,
              mtry = 10,
              min.node.size = 20)

In [None]:
# Display the fitted ranger model object.
show(fm1)

In [None]:
# Predict on the held-out rows and report the share of predictions that match
# the observed outcome. This is the test accuracy; the test error is one minus
# this value.
yhat <- predict(fm1, data = titanic[test, ])
mean(titanic[test, "readmitted"] == yhat$predictions)   # test accuracy

In [None]:
# Plot the impurity-based importance of the (up to) 30 most important variables

imp <- sort(ranger::importance(fm1), decreasing = TRUE)
imp_vals <- data.frame(gini = imp)
# Fix the factor levels to the sorted order so the axis follows the
# importance ranking instead of alphabetical order.
imp_vals$variable <- factor(names(imp), levels = names(imp))

# head() caps at the number of available rows; indexing with 1:30 would pad
# the data with all-NA rows when there are fewer than 30 predictors.
ggplot(head(imp_vals, 30), aes(x = gini, y = variable)) +
    geom_point(colour = "blue")

In [None]:
# Fit a second classification forest on the same training rows, this time
# with the randomForest package, using 500 trees and importance tracking.
fm2 <- randomForest(
    factor(readmitted) ~ .,
    data = titanic[train, ],
    ntree = 500,
    importance = TRUE
)

In [None]:
# Display the fitted randomForest model object.
show(fm2)

In [None]:
# Classify the held-out rows with the randomForest model and report the
# proportion of predictions that agree with the observed outcome.
yhat <- predict(fm2, newdata = titanic[test, ])
mean(yhat == titanic[test, "readmitted"])

---

---

# 2 Speed Benchmark in R

### 2.1 Truncating Near-Zero Values

In [None]:
# Set every entry of `A` whose absolute value is below `thresh` to 0 and
# return the modified copy.
#
# This is the element-by-element version that the faster variants in this
# section are timed against, so the explicit double loop is kept on purpose.
#
# A:      numeric matrix.
# thresh: cutoff; entries with abs(value) < thresh are replaced by 0.
# Returns a matrix with the same dimensions as `A`; NA entries are left as-is.
trunc_near_zeros <- function(A, thresh) {
    n <- nrow(A)
    p <- ncol(A)

    # seq_len() gives an empty sequence for a zero-row or zero-column matrix,
    # whereas 1:0 would iterate over c(1, 0) and index out of bounds.
    for (j in seq_len(p)) {
        for (i in seq_len(n)) {
            a_ij <- A[i, j]
            # Check for NA first: `if (NA)` is an error in R.
            if (!is.na(a_ij) && abs(a_ij) < thresh) {
                A[i, j] <- 0.0
            }
        }
    }
    A
}

In [None]:
# Benchmark input: a 1,000,000 x 100 matrix of standard-normal draws.
n <- 1e6
p <- 100
X <- matrix(rnorm(n * p), nrow = n, ncol = p)

In [None]:
# Time the element-by-element version on X; the truncated matrix is kept
# in X2.
system.time({
    X2 <- trunc_near_zeros(X, thresh = 0.001)
})

---

---

---

---

### 2.2 More Idiomatic (and faster) Version

In [None]:
# Column-wise version: replace entries of `dat` whose absolute value is below
# `thresh` with 0, processing one whole column per iteration via ifelse().
#
# dat:    numeric matrix (indexed as dat[, j]).
# thresh: cutoff; entries with abs(value) < thresh are replaced by 0.
# Returns `dat` with the small entries zeroed.
trunc_near_zeros2 <- function(dat, thresh) {
    # seq_len() gives an empty sequence when there are no columns, whereas
    # 1:0 would iterate over c(1, 0) and index out of bounds.
    for (j in seq_len(ncol(dat))) {
        col_j <- dat[, j]
        dat[, j] <- ifelse(abs(col_j) < thresh, 0.0, col_j)
    }
    dat
}

In [None]:
# Regenerate the 1,000,000 x 100 standard-normal matrix and time the
# column-wise version on it.
n <- 1000000
p <- 100
X <- matrix(rnorm(n * p), ncol = p)

# Time the call on X (the matrix built above) with the same 0.001 threshold
# as the other benchmarks, so the timings are comparable. The previous call
# passed `df1`, which is not defined in this notebook.
system.time(expr = {
    X2 <- trunc_near_zeros2(X, 0.001)
})

### 2.3 Even Faster Version

In [None]:
# Fully vectorized version: build one logical mask over the whole object and
# overwrite the masked entries with 0 in a single step.
#
# dat:    numeric matrix (or vector).
# thresh: cutoff; entries with abs(value) < thresh are replaced by 0.
# Returns `dat` with the small entries zeroed.
trunc_near_zeros3 <- function(dat, thresh) {
    is_small <- abs(dat) < thresh
    replace(dat, is_small, 0.0)
}

In [None]:
# Regenerate the 1,000,000 x 100 standard-normal matrix and time the fully
# vectorized version on it; the truncated matrix is kept in X3.
n <- 1e6
p <- 100
X <- matrix(rnorm(n * p), nrow = n, ncol = p)

system.time({
    X3 <- trunc_near_zeros3(X, thresh = 0.001)
})