In [1]:
suppressPackageStartupMessages({
    library(foreign) # reading stata dataset
    library(dplyr)# table manipulation
    })
# Load the TIV comparison data, drop rows where `man` is missing,
# and keep only the three columns used below.
mydata <- read.dta("comparison_TIV_all_resp_2.dta") %>%
    filter(!is.na(man)) %>%
    select(man, fs5, spm12_mwc)
head(mydata)

Unnamed: 0_level_0,man,fs5,spm12_mwc
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
3,1345.541,1497.891,1353.598
4,1381.081,1436.819,1359.308
5,1285.623,1268.764,1265.084
6,1574.595,1680.504,1557.601
7,1182.91,1172.865,1170.121
8,1408.842,1500.151,1391.574


In [2]:
# Regress each of spm12_mwc and fs5 on man. Print the fitted coefficients
# and only the R-squared from the summary, not the full summary table.
model.mwc <- lm(spm12_mwc ~ man, data = mydata)
print(model.mwc)
sprintf("Rsquared=%f", summary(model.mwc)$r.squared)

model.fs5 <- lm(fs5 ~ man, data = mydata)
print(model.fs5)
sprintf("Rsquared=%f", summary(model.fs5)$r.squared)


Call:
lm(formula = spm12_mwc ~ man, data = mydata)

Coefficients:
(Intercept)          man  
     1.5103       0.9707  




Call:
lm(formula = fs5 ~ man, data = mydata)

Coefficients:
(Intercept)          man  
   -345.260        1.289  



In [3]:
# Keep only rows whose fs5 value is below 3000
mycleandata <- mydata %>% filter(fs5 < 3000)

In [4]:
library(broom)
library(dplyr)
library(purrr)
library(tibble)
library(rsample)
library(boot)

In [5]:
# required arguments of boot: function (data, statistic, R)
# data : data frame to sample from
# statistic : function that calculates the statistic or statistics to bootstrap
# R : number of repetitions

# Step 1: define functions that return our statistic of interest:
# the difference in R-squared values between the two models.
# diffR2est takes an rsample split; diffR2 takes a data frame and
# sampling indices:
# Difference in R-squared (spm12_mwc ~ man minus fs5 ~ man) for one
# rsample split, returned in the term/estimate/std.err layout that is
# passed on to int_bca().
#   thisdata : a split whose analysis set holds man, spm12_mwc and fs5
#   ...      : accepted but unused
diffR2est <- function(thisdata, ...){
    # Extract the analysis set once and fit both models on it
    resampled <- analysis(thisdata)
    r2.mwc <- summary(lm(spm12_mwc ~ man, resampled))$r.squared
    r2.fs5 <- summary(lm(fs5 ~ man, resampled))$r.squared
    tibble(
        term = "r2diff",
        estimate = (r2.mwc - r2.fs5),
        std.err = NA_real_
    )
}
# Difference in R-squared (spm12_mwc ~ man minus fs5 ~ man) on the rows of
# `thisdata` selected by `set`; this is the (data, indices) signature that
# boot() calls its statistic with.
#   thisdata : data frame with columns man, spm12_mwc and fs5
#   set      : row indices of the resample
# Returns a single numeric value.
diffR2 <- function(thisdata, set){
    resampled <- thisdata[set, ]
    r2.mwc <- summary(lm(spm12_mwc ~ man, resampled))$r.squared
    r2.fs5 <- summary(lm(fs5 ~ man, resampled))$r.squared
    r2.mwc - r2.fs5
}

In [6]:
# Time 10000 bootstrap replicates of the R-squared difference,
# first with rsample, then with boot.
system.time(
    r2_rs <- bootstraps(mycleandata, times = 10000, apparent = TRUE) %>%
        mutate(results = map(splits, diffR2est))
)
system.time(
    r2_boot <- boot(data = mycleandata, statistic = diffR2, R = 10000)
)

   user  system elapsed 
 43.248   0.041  43.635 

   user  system elapsed 
 26.497   0.007  26.726 

In [7]:
# BCa interval from the rsample bootstrap results (timed)
system.time(
    r2_bca <- int_bca(r2_rs, results, .fn = diffR2est)
)
r2_bca

   user  system elapsed 
  1.691   0.003   1.709 

term,.lower,.estimate,.upper,.alpha,.method
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
r2diff,0.1022015,0.1395397,0.1945306,0.05,BCa


In [8]:
# BCa interval from the boot replicates (timed)
system.time(
    mydataci <- boot.ci(boot.out = r2_boot, type = "bca")
)
mydataci

   user  system elapsed 
  2.364   0.079   2.471 

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = r2_boot, type = "bca")

Intervals : 
Level       BCa          
95%   ( 0.1016,  0.1939 )  
Calculations and Intervals on Original Scale

In [9]:
# Risk ratio :
armsize <- 14000
inf.vacc <- 8
inf.plac <- 86
rr <- (inf.vacc/armsize) / (inf.plac/armsize)
effic <- 1-rr
print(paste(100*c(rr, effic)))

[1] "9.30232558139535" "90.6976744186046"


In [10]:
# Rows per arm for the reduced data set `vacctrial` built in the next cell
smallarm <- 1000


In [11]:
# One row per participant: group 0 rows first (inf.plac with outcome 1),
# then group 1 rows (inf.vacc with outcome 1); smallarm rows per group.
vacctrial <- data.frame(
    outcome = rep(
        c(1, 0, 1, 0),
        times = c(inf.plac, smallarm - inf.plac, inf.vacc, smallarm - inf.vacc)
    ),
    group = rep(c(0, 1), each = smallarm)
)
head(vacctrial)
tail(vacctrial)
table(vacctrial)

outcome,group
<dbl>,<dbl>
1,0
1,0
1,0
1,0
1,0
1,0


Unnamed: 0_level_0,outcome,group
Unnamed: 0_level_1,<dbl>,<dbl>
1995,0,1
1996,0,1
1997,0,1
1998,0,1
1999,0,1
2000,0,1


       group
outcome   0   1
      0 914 992
      1  86   8

In [12]:
# Efficacy (1 - risk ratio) on the rows of `d` selected by `set`; this is
# the (data, indices) signature that boot() calls its statistic with.
#   d   : data frame with columns outcome (0/1) and group (0 or 1)
#   set : row indices of the resample
# Returns 1 - (event rate in group 1) / (event rate in group 0).
effrate <- function(d, set) {
    resampled <- d[set, ]
    in.treat <- resampled$group == 1
    in.place <- resampled$group == 0
    risk.treat <- sum(resampled$outcome[in.treat]) / sum(in.treat)
    risk.place <- sum(resampled$outcome[in.place]) / sum(in.place)
    1 - risk.treat / risk.place
}
# Point estimate on the full data set. Pass the row indices explicitly
# rather than relying on the missing `set` argument falling through to `[`.
effrate(vacctrial, seq_len(nrow(vacctrial)))

In [13]:
# Efficacy (1 - risk ratio) for one rsample split, returned in the
# term/estimate/std.err layout that is passed on to int_bca().
#   d   : a split whose analysis set holds outcome (0/1) and group (0 or 1)
#   ... : accepted but unused
effrate_est <- function(d, ...){
    resampled <- analysis(d)
    in.treat <- resampled$group == 1
    in.place <- resampled$group == 0
    risk.treat <- sum(resampled$outcome[in.treat]) / sum(in.treat)
    risk.place <- sum(resampled$outcome[in.place]) / sum(in.place)
    tibble(
        term = "eff",
        estimate = 1 - risk.treat / risk.place,
        std.err = NA_real_
    )
}

Using `boot` (single threaded):

In [14]:
# 10000 bootstrap replicates of the efficacy, resampled within each group
system.time(
    boot.risk.10000 <- boot(
        data = vacctrial,
        statistic = effrate,
        R = 10000,
        strata = vacctrial$group
    )
)

   user  system elapsed 
 16.107   0.132  16.355 

Using `rsample` (single threaded) is somewhat slower:

In [15]:
# Resample within each arm (strata = group) so that the rsample bootstrap
# uses the same stratified scheme as the boot() call it is compared with;
# without it the two intervals come from different resampling designs.
system.time(
    effrate_rs <- bootstraps(vacctrial, times = 10000, strata = group,
                             apparent = TRUE) %>%
        mutate(results = map(splits, effrate_est))
)

   user  system elapsed 
 31.486   0.032  31.727 

By default, the `boot` package computes the empirical influence values needed for BCa intervals by regression on the bootstrap replicates. For large sample sizes this is quite slow and memory hungry.

In [16]:
# BCa interval with boot.ci's default influence-value calculation (timed)
system.time(
    vacctrial_ci <- boot.ci(boot.out = boot.risk.10000, type = "bca")
)
vacctrial_ci

   user  system elapsed 
 68.065   0.871  69.853 

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = boot.risk.10000, type = "bca")

Intervals : 
Level       BCa          
95%   ( 0.8095,  0.9579 )  
Calculations and Intervals on Original Scale

In `rsample`, `int_bca` uses the jackknife for the BCa calculations, which in this case is faster:

In [17]:
# BCa interval from the rsample bootstrap results (timed)
system.time(
    eff_bca <- int_bca(effrate_rs, results, .fn = effrate_est)
)
eff_bca

   user  system elapsed 
  5.151   0.053   5.245 

term,.lower,.estimate,.upper,.alpha,.method
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
eff,0.8133908,0.9062049,0.9595117,0.05,BCa


But `boot` can also use the jackknife for the influence estimates:

In [18]:
# BCa interval with jackknife influence values supplied through `L` (timed)
system.time(
    vacctrial_ci_jack <- boot.ci(
        boot.out = boot.risk.10000,
        type = "bca",
        L = empinf(boot.risk.10000, type = "jack")
    )
)
vacctrial_ci_jack

   user  system elapsed 
  0.666   0.051   0.722 

BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = boot.risk.10000, type = "bca", L = empinf(boot.risk.10000, 
    type = "jack"))

Intervals : 
Level       BCa          
95%   ( 0.8095,  0.9579 )  
Calculations and Intervals on Original Scale

In [18]:
# Same layout as vacctrial, but with armsize rows per group
vacctrialbig <- data.frame(
    outcome = rep(
        c(1, 0, 1, 0),
        times = c(inf.plac, armsize - inf.plac, inf.vacc, armsize - inf.vacc)
    ),
    group = rep(c(0, 1), each = armsize)
)
table(vacctrialbig)

       group
outcome     0     1
      0 13914 13992
      1    86     8

In [19]:
# Resample within each arm (strata = group) so the arm sizes stay fixed in
# every replicate, matching the stratified boot() analysis of vacctrial.
system.time(
    effratebig_rs <- bootstraps(vacctrialbig, times = 10000, strata = group,
                                apparent = TRUE) %>%
        mutate(results = map(splits, effrate_est))
)

   user  system elapsed 
308.280   0.955 310.817 

   user  system elapsed 
333.330   1.014 337.286 

In [20]:
## memory hungry
#system.time(effbig_bca<-int_bca(effratebig_rs,results, .fn=effrate_est))
#effbig_bca

In [21]:
library(furrr)

Loading required package: future



In [22]:
# Evaluate the statistic on two background R sessions.
plan(multisession, workers = 2)
# Resample within each arm (strata = group) so the arm sizes stay fixed in
# every replicate, matching the stratified boot() analysis of vacctrial.
system.time(
    effrate_rs_par <- bootstraps(vacctrial, times = 10000, strata = group,
                                 apparent = TRUE) %>%
        mutate(results = future_map(splits, effrate_est))
)

   user  system elapsed 
 10.600   0.292  31.137 

In [23]:
# BCa interval from the parallel rsample bootstrap results (timed)
system.time(
    eff_par_bca <- int_bca(effrate_rs_par, results, .fn = effrate_est)
)
eff_par_bca

   user  system elapsed 
  2.172   0.050   4.478 

term,.lower,.estimate,.upper,.alpha,.method
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
eff,0.8150653,0.9056931,0.9579232,0.05,BCa


In [None]:
## Might kill things
#plan(multisession, workers=2)
#system.time(effratebig_rs_par <- bootstraps(vacctrialbig, 10000, apparent=TRUE) %>% mutate(results=future_map(splits,effrate_est)))

In [None]:
#system.time(effbig_par_bca<-int_bca(effratebig_rs_par,results, .fn=effrate_est))
#effbig_par_bca