# Lab 1: Running parallel R
### Objective:
Learn how to run parallel R
### Successful outcome:
Investigate resource pressures between serial and parallel approaches.

## Step 1: Install packages 

In [None]:
# Install any packages named in `pkg` that are not yet installed, then
# attach every package in `pkg`.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A logical vector named by package: TRUE where the package was attached,
#   FALSE where it could not be loaded. An empty `pkg` yields an empty
#   logical vector.
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # require() is used on purpose here: its TRUE/FALSE return value is the
  # per-package status we report. vapply() pins the result type to logical,
  # whereas sapply() would hand back a list for empty input.
  vapply(pkg, require, logical(1), character.only = TRUE)
}

# Packages this lab depends on: install whichever are missing, then attach
# them all in one go.
packages <- c(
  "snow", "foreach", "doSNOW",
  "parallel", "boot"
)
ipak(packages)

## Step 2: Define a simple R function

In [None]:
# Simulate one unit of work: draw a random vector, pause, and sum it.
#
# Args:
#   size:  number of standard-normal values to draw (default 100000).
#   delay: seconds to sleep between drawing and summing (default 2, the
#          value the lab was written with). Pass 0 to skip the pause.
#
# Returns:
#   The sum of the `size` random draws, as a single numeric value.
myProc <- function(size = 100000, delay = 2) {
  # Load a large vector
  vec <- rnorm(size)
  # Now sleep on it, so each call takes a noticeable amount of wall-clock time
  Sys.sleep(delay)
  # Now sum the vec values
  sum(vec)
}

## Step 3: Serial approach -- apply function

In [None]:
# Serial baseline: ten back-to-back calls to myProc(), timed end to end.
ptm <- proc.time()
result <- sapply(seq_len(10), function(run) myProc())
# Elapsed user/system/wall-clock time since `ptm` was taken.
proc.time() - ptm

## Step 4: Parallel approach -- parallel package

In [None]:
# library() stops with an error if the package is missing; require() would
# only return FALSE and let the failure surface later at mclapply().
library(parallel)
# Run the ten myProc() calls across up to 10 forked worker processes.
# Per the parallel package documentation, mc.cores > 1 is not supported on
# Windows because mclapply() relies on forking.
ptm <- proc.time()
result <- mclapply(1:10, function(i) myProc(), mc.cores = 10)
proc.time() - ptm

## Step 5: Parallel approach -- snow package

In [None]:
# library() stops with an error if snow is missing; require() would only
# return FALSE and let the failure surface later at makeSOCKcluster().
library(snow)
# Ten socket workers, all on the local machine.
hostnames <- rep("localhost", 10)
cluster <- makeSOCKcluster(hostnames)
# The `finally` clause shuts the workers down even if the export or the
# computation fails, so no orphaned R processes are left behind.
elapsed <- tryCatch({
  # Workers start with an empty workspace, so ship myProc to each of them.
  clusterExport(cluster, list("myProc"))
  ptm <- proc.time()
  result <- clusterApply(cluster, 1:10, function(i) myProc())
  proc.time() - ptm
}, finally = stopCluster(cluster))
elapsed

## Step 6: Parallel approach -- foreach + snow package

In [None]:
# library() stops with an error if a package is missing; require() would
# only return FALSE and let the failure surface later.
library(foreach)
library(doSNOW)

# Ten socket workers, all on the local machine.
hostnames <- rep("localhost", 10)
cluster <- makeSOCKcluster(hostnames)
# The `finally` clause shuts the workers down even if registration or the
# loop fails, so no orphaned R processes are left behind.
elapsed <- tryCatch({
  # Make the snow cluster the backend that %dopar% dispatches to.
  registerDoSNOW(cluster)
  ptm <- proc.time()
  # .combine = c concatenates the ten per-iteration sums into one vector.
  result <- foreach(i = 1:10, .combine = c) %dopar% {
    myProc()
  }
  proc.time() - ptm
}, finally = stopCluster(cluster))
elapsed

In [None]:
# Open the reference page for foreach() (arguments such as .combine).
help("foreach")

## Step 7: Bootstrap calculations based on serial implementation

In [None]:
# Dataset: one million standard-normal draws arranged in 1000 columns.
random.data <- matrix(rnorm(1000000), ncol = 1000)
# Bootstrap statistic: the median of the observations selected by the
# resampling indices `n` that boot() passes in.
bmed <- function(d, n) median(d[n])
library(boot)
ptm <- proc.time()
# For each of the first 100 columns, run 2000 bootstrap replicates of the
# median and take the standard deviation of those replicates.
sapply(seq_len(100), function(col) {
  replicates <- boot(random.data[, col], bmed, R = 2000)
  sd(replicates$t)
})
proc.time() - ptm

## Step 8.1: Bootstrap calculations based on parallel implementation

In [None]:
# Dataset: one million standard-normal draws arranged in 1000 columns.
random.data <- matrix(rnorm(1000000), ncol = 1000)
# Bootstrap statistic: the median of the observations selected by the
# resampling indices `n` that boot() passes in.
bmed <- function(d, n) median(d[n])
library(boot)
cluster <- makeCluster(10, type = "SOCK")
# The `finally` clause shuts the workers down even if registration, the
# export or the loop fails, so no orphaned R processes are left behind.
elapsed <- tryCatch({
  registerDoSNOW(cluster)
  # Ship the data and the statistic to every worker up front.
  clusterExport(cluster, c("random.data", "bmed"))
  ptm <- proc.time()
  # .packages = "boot" loads boot on each worker, replacing a library()
  # call inside the loop body. .combine = c concatenates the 100 standard
  # deviations into one numeric vector.
  results <- foreach(n = 1:100, .combine = c, .packages = "boot") %dopar% {
    sd(boot(random.data[, n], bmed, R = 2000)$t)
  }
  proc.time() - ptm
}, finally = stopCluster(cluster))
results
elapsed

## Q1: Combine vectors (the results) into a matrix 

In [None]:
# Dataset: one million standard-normal draws arranged in 1000 columns.
random.data <- matrix(rnorm(1000000), ncol = 1000)
# Bootstrap statistic: the median of the observations selected by the
# resampling indices `n` that boot() passes in.
bmed <- function(d, n) median(d[n])
library(boot)
cluster <- makeCluster(10, type = "SOCK")
# The `finally` clause shuts the workers down even if registration, the
# export or the loop fails, so no orphaned R processes are left behind.
elapsed <- tryCatch({
  registerDoSNOW(cluster)
  # Ship the data and the statistic to every worker up front.
  clusterExport(cluster, c("random.data", "bmed"))
  ptm <- proc.time()

  # Solution: change .combine argument
  # Specifying 'c' is useful for concatenating the results into a vector
  # NOTE(review): the exercise asks for a matrix, but `.combine = c` still
  # yields a vector. With one scalar per iteration, `.combine = rbind` would
  # give a 100 x 1 matrix instead -- confirm which is intended.
  # .packages = "boot" loads boot on each worker, replacing a library()
  # call inside the loop body.
  results <- foreach(n = 1:100, .combine = c, .packages = "boot") %dopar% {
    sd(boot(random.data[, n], bmed, R = 2000)$t)
  }
  proc.time() - ptm
}, finally = stopCluster(cluster))
results
elapsed

# Calculate mean of results
summary_results <- data.frame(Mean = mean(results))
summary_results