diff --git a/.gitignore b/.gitignore index a899df2..869c0d6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ src/*.dll src/Makevars tests/testthat/*bin tests/testthat/*desc +*.bin +*.desc diff --git a/DESCRIPTION b/DESCRIPTION index ebed6e2..f77867a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,3 +18,5 @@ License: LGPL-3 | Apache License 2.0 URL: http://www.bigmemory.org LazyLoad: yes Biarch: yes +VignetteBuilder: knitr +Suggests: knitr, testthat diff --git a/R/RcppExports.R b/R/RcppExports.R index 1505a88..549a9de 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -117,6 +117,9 @@ GetTypeString <- function(bigMatAddr) { .Call('bigmemory_GetTypeString', PACKAGE = 'bigmemory', bigMatAddr) } +#' @title big.matrix size +#' @description Returns the size of the created matrix in bytes +#' @param bigMat a \code{big.matrix} object #' @export GetMatrixSize <- function(bigMat) { .Call('bigmemory_GetMatrixSize', PACKAGE = 'bigmemory', bigMat) diff --git a/R/bigmemory.R b/R/bigmemory.R index 290ce35..de20f06 100644 --- a/R/bigmemory.R +++ b/R/bigmemory.R @@ -112,7 +112,7 @@ filebacked.big.matrix <- function(nrow, ncol, init=NULL, dimnames=NULL, separated=FALSE, backingfile=NULL, backingpath=NULL, descriptorfile=NULL, binarydescriptor=FALSE) -{ +{ if (nrow < 1 | ncol < 1) stop('A big.matrix must have at least one row and one column') @@ -157,6 +157,12 @@ filebacked.big.matrix <- function(nrow, ncol, backingpath <- path.expand(backingpath) backingpath <- file.path(backingpath, '.') backingpath <- substr( backingpath, 1, nchar(backingpath)-1 ) + + if(file.exists(file.path(backingpath, backingfile))){ + stop("Backing file already exists! 
Either remove or specify + different backing file name") + } + address <- CreateFileBackedBigMatrix(as.character(backingfile), as.character(backingpath), as.double(nrow), as.double(ncol), as.character(colnames), as.character(rownames), as.integer(typeVal), @@ -199,6 +205,10 @@ setGeneric('as.big.matrix', descriptorfile=NULL, binarydescriptor=FALSE, shared=TRUE) standardGeneric('as.big.matrix')) +#' @title Convert to base R matrix +#' @description Extract values from a \code{big.matrix} object +#' and convert to a base R matrix object +#' @param x A big.matrix object #' @export setMethod('as.matrix', signature(x='big.matrix'), function(x) return(x[,])) @@ -283,9 +293,11 @@ setMethod('as.big.matrix', signature(x='vector'), #' @export setGeneric('is.big.matrix', function(x) standardGeneric('is.big.matrix')) +#' @rdname big.matrix setMethod('is.big.matrix', signature(x='big.matrix'), function(x) return(TRUE)) +#' @rdname big.matrix setMethod('is.big.matrix', definition=function(x) return(FALSE)) @@ -341,18 +353,32 @@ assign('rownames.bm<-', return(x) }) +#' @title The Number of Rows/Columns of a big.matrix +#' @description \code{nrow} and \code{ncol} return the number of +#' rows or columns present in a \code{big.matrix} object. 
+#' @param x A big.matrix object +#' @return An integer of length 1 +#' @docType methods +#' @rdname ncol-methods #' @export setMethod('ncol', signature(x="big.matrix"), function(x) return(CGetNcol(x@address))) +#' @rdname ncol-methods #' @export setMethod('nrow', signature(x="big.matrix"), function(x) return(CGetNrow(x@address))) +#' @title Dimensions of a big.matrix object +#' @description Retrieve the dimensions of a \code{big.matrix} object +#' @param x A \code{big.matrix} object #' @export setMethod('dim', signature(x="big.matrix"), function(x) return(c(nrow(x), ncol(x)))) +#' @title Length of a big.matrix object +#' @description Get the length of a \code{big.matrix} object +#' @param x A \code{big.matrix} object #' @export setMethod('length', signature(x="big.matrix"), function(x) return(prod(dim(x)))) @@ -470,51 +496,76 @@ GetAll.bm <- function(x, drop=TRUE) return(mat) } +#' @title Extract or Replace big.matrix elements +#' @name Extract,big.matrix +#' @param x A \code{big.matrix} object +#' @param i Indices specifying the rows +#' @param j Indices specifying the columns +#' @param drop Logical indicating whether to reduce the result to its minimum dimensions +#' @param value typically an array-like R object of similar class +#' @docType methods +#' @rdname extract-methods +#' @aliases [,big.matrix,ANY,ANY,missing-method #' @export setMethod("[", signature(x = "big.matrix", drop = "missing"), - function(x, i, j) return(GetElements.bm(x, i, j))) + function(x, i, j, drop) return(GetElements.bm(x, i, j))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", drop = "logical"), function(x, i, j, drop) return(GetElements.bm(x, i, j, drop))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", i="missing", drop = "missing"), - function(x, j) return(GetCols.bm(x, j))) + function(x, i, j, drop) return(GetCols.bm(x, j))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", i="missing", drop = 
"logical"), - function(x, j, drop) return(GetCols.bm(x, j, drop))) + function(x, i, j, drop) return(GetCols.bm(x, j, drop))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", j="missing", drop = "missing"), - function(x, i) return(GetRows.bm(x, i))) + function(x, i, j, drop) return(GetRows.bm(x, i))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", j="missing", drop = "logical"), - function(x, i, drop) return(GetRows.bm(x, i, drop))) + function(x, i, j, drop) return(GetRows.bm(x, i, drop))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", i="missing", j="missing", drop = "missing"), - function(x) return(GetAll.bm(x))) + function(x, i, j, drop) return(GetAll.bm(x))) + +#' @rdname extract-methods #' @export setMethod("[", signature(x = "big.matrix", i="missing", j="missing", drop = "logical"), - function(x, drop) return(GetAll.bm(x, drop))) + function(x, i, j, drop) return(GetAll.bm(x, drop))) # Function contributed by Peter Haverty at Genentech. 
+#' @rdname extract-methods #' @export setMethod('[', signature(x = "big.matrix",i="matrix",j="missing",drop="missing"), - function(x, i) return(GetIndivElements.bm(x, i))) + function(x, i, j, drop) return(GetIndivElements.bm(x, i))) SetElements.bm <- function(x, i, j, value) @@ -890,32 +941,41 @@ SetAll.bm <- function(x, value) return(x) } +#' @rdname extract-methods #' @export setMethod('[<-', signature(x = "big.matrix"), function(x, i, j, value) return(SetElements.bm(x, i, j, value))) +#' @rdname extract-methods #' @export setMethod('[<-', signature(x = "big.matrix", i="missing"), - function(x, j, value) return(SetCols.bm(x, j, value))) + function(x, i, j, value) return(SetCols.bm(x, j, value))) +#' @rdname extract-methods #' @export setMethod('[<-', signature(x = "big.matrix", j="missing"), - function(x, i, value) return(SetRows.bm(x, i, value))) + function(x, i, j, value) return(SetRows.bm(x, i, value))) +#' @rdname extract-methods #' @export setMethod('[<-', signature(x = "big.matrix", i="missing", j="missing"), - function(x, value) return(SetAll.bm(x, value))) + function(x, i, j, value) return(SetAll.bm(x, value))) # Function contributed by Peter Haverty at Genentech. 
+#' @rdname extract-methods #' @export setMethod('[<-', signature(x = "big.matrix",i="matrix",j="missing"), - function(x, i, value) return(SetIndivElements.bm(x, i, value))) + function(x, i, j, value) return(SetIndivElements.bm(x, i, value))) +#' @title The Type of a big.matrix Object +#' @description \code{typeof} returns the storage type of a +#' \code{big.matrix} object +#' @param x A \code{big.matrix} object #' @export setMethod('typeof', signature(x="big.matrix"), function(x) { @@ -926,11 +986,16 @@ setMethod('typeof', signature(x="big.matrix"), # Little function to test if a value is # the 'R' representation of float/single value +#' @title Check if Float +#' @param x An object to be evaluated if float #' @export setGeneric('is.float', function(x){ standardGeneric('is.float') }) +#' @title Is Float? +#' @description Check if R numeric value has float flag +#' @param x A numeric value setMethod('is.float', signature(x='numeric'), function(x){ if(is.null(attr(x, 'Csingle'))){ @@ -941,6 +1006,13 @@ setMethod('is.float', signature(x='numeric'), } }) +#' @title Return First or Last Part of a big.matrix Object +#' @description Returns the first or last parts of a \code{big.matrix} +#' object. +#' @param x A big.matrix object +#' @param n A single integer for the number of rows to return +#' @docType methods +#' @rdname head-methods #' @export setMethod('head', signature(x="big.matrix"), function(x, n = 6) { @@ -949,6 +1021,8 @@ setMethod('head', signature(x="big.matrix"), return(x[1:n,]) }) + +#' @rdname head-methods #' @export setMethod('tail', signature(x="big.matrix"), function(x, n = 6) { @@ -957,6 +1031,13 @@ setMethod('tail', signature(x="big.matrix"), return(x[(nrow(x)-n+1):nrow(x),]) }) +#' @title Print Values +#' @description \code{print} will print out the elements within +#' a \code{big.matrix} object. +#' @note By default, this will only return the \code{head} of a big.matrix +#' to prevent console overflow. 
If you turn off the bigmemory.print.warning +#' option then it will convert to a base R matrix and print all elements. +#' @param x A \code{big.matrix} object #' @export setMethod('print', signature(x='big.matrix'), function(x) { @@ -1143,10 +1224,19 @@ mwhich.internal <- function(x, cols, vals, comps, op, whichFuncName) return(ret) } + +#' @title Dimnames of a big.matrix Object +#' @description Retrieve or set the dimnames of an object +#' @param x A big.matrix object +#' @param value A possible value for \code{dimnames(x)} +#' @docType methods +#' @rdname dimnames-methods #' @export setMethod('dimnames', signature(x = "big.matrix"), function(x) return(list(rownames.bm(x), colnames.bm(x)))) + +#' @rdname dimnames-methods #' @export setMethod('dimnames<-', signature(x = "big.matrix", value='list'), function(x, value) { @@ -1167,6 +1257,7 @@ setGeneric('write.big.matrix', function(x, filename, row.names=FALSE, col.names=FALSE, sep=",") standardGeneric('write.big.matrix')) +#' @rdname write.big.matrix setMethod('write.big.matrix', signature(x='big.matrix',filename='character'), function(x, filename, row.names, col.names, sep) { @@ -1198,6 +1289,7 @@ setGeneric('read.big.matrix', shared=TRUE) standardGeneric('read.big.matrix')) +#' @rdname write.big.matrix setMethod('read.big.matrix', signature(filename='character'), function(filename, sep, header, col.names, row.names, has.row.names, ignore.row.names, type, skip, separated, backingfile, backingpath, @@ -1308,6 +1400,7 @@ setMethod('read.big.matrix', signature(filename='character'), #' @export setGeneric('is.separated', function(x) standardGeneric('is.separated')) +#' @rdname big.matrix setMethod('is.separated', signature(x='big.matrix'), function(x) return(IsSeparated(x@address))) @@ -1399,7 +1492,7 @@ deepcopy <- function(x, cols=NULL, rows=NULL, setGeneric('is.sub.big.matrix', function(x) standardGeneric('is.sub.big.matrix')) - +#' @rdname sub.big.matrix setMethod('is.sub.big.matrix', signature(x='big.matrix'), 
function(x) return(CIsSubMatrix(x@address)) ) @@ -1412,6 +1505,8 @@ setMethod('is.sub.big.matrix', signature(x='big.matrix'), setGeneric('sub.big.matrix', function(x, firstRow=1, lastRow=NULL, firstCol=1, lastCol=NULL, backingpath=NULL) standardGeneric('sub.big.matrix')) + +#' @rdname sub.big.matrix setMethod('sub.big.matrix', signature(x='big.matrix'), function(x, firstRow, lastRow, firstCol, lastCol, backingpath) { @@ -1420,6 +1515,12 @@ setMethod('sub.big.matrix', signature(x='big.matrix'), }) #' @rdname big.matrix.descriptor-class +#' @param x A descriptor object +#' @param firstRow the first row of the submatrix +#' @param lastRow the last row of the submatrix if not NULL +#' @param firstCol the first column of the submatrix +#' @param lastCol the last column of the submatrix if not NULL +#' @param backingpath required path to the filebacked object, if applicable setMethod('sub.big.matrix', signature(x='big.matrix.descriptor'), function( x, firstRow, lastRow, firstCol, lastCol, backingpath) { @@ -1494,6 +1595,10 @@ attach.big.matrix = function(obj, ...) } #' @rdname big.matrix.descriptor-class +#' @param obj The filename of the descriptor for a filebacked matrix, +#' assumed to be in the directory specified +#' @param ... possibly \code{path} which gives the path where the descriptor +#' and/or filebacking can be found. #' @export setMethod('attach.resource', signature(obj='character'), function(obj, ...) 
@@ -1613,6 +1718,7 @@ setMethod('attach.resource', signature(obj='big.matrix.descriptor'), #' @export setGeneric('is.filebacked', function(x) standardGeneric('is.filebacked')) +#' @rdname big.matrix setMethod('is.filebacked', signature(x='big.matrix'), function(x) return(IsFileBackedBigMatrix(x@address))) @@ -1620,6 +1726,7 @@ setMethod('is.filebacked', signature(x='big.matrix'), #' @export setGeneric('shared.name', function(x) standardGeneric('shared.name')) +#' @rdname big.matrix setMethod('shared.name', signature(x='big.matrix'), function(x) return(SharedName(x@address))) @@ -1627,6 +1734,7 @@ setMethod('shared.name', signature(x='big.matrix'), #' @export setGeneric('file.name', function(x) standardGeneric('file.name')) +#' @rdname big.matrix setMethod('file.name', signature(x='big.matrix'), function(x) { @@ -1658,6 +1766,7 @@ t.big.matrix <- function(x, backingfile=NULL, #' @export setGeneric('flush', function(con) standardGeneric('flush')) +#' @rdname flush-methods setMethod('flush', signature(con='big.matrix'), function(con) { @@ -1674,6 +1783,7 @@ setMethod('flush', signature(con='big.matrix'), #' @export setGeneric('is.shared', function(x) standardGeneric('is.shared')) +#' @rdname big.matrix setMethod('is.shared', signature(x='big.matrix'), function(x) return(IsShared(x@address))) @@ -1764,6 +1874,7 @@ mpermute <- function(x, order=NULL, cols=NULL, allow.duplicates=FALSE, ...) 
#' @export setGeneric('is.readonly', function(x) standardGeneric('is.readonly')) +#' @rdname big.matrix setMethod('is.readonly', signature(x='big.matrix'), function(x) IsReadOnly(x@address)) diff --git a/README.md b/README.md index 04deb1e..48e2866 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Build Status](https://travis-ci.org/kaneplusplus/bigmemory.png)](https://travis-ci.org/kaneplusplus/bigmemory) - diff --git a/examples/attach.big.matrix_examples.R b/examples/attach.big.matrix_examples.R deleted file mode 100644 index 6d9e528..0000000 --- a/examples/attach.big.matrix_examples.R +++ /dev/null @@ -1,20 +0,0 @@ -# The example is quite silly, as you wouldn't likely do this in a -# single R session. But if zdescription were passed to another R session -# via SNOW, foreach, or even by a simple file read/write, -# then the attach of the second R process would give access to the -# same object in memory. Please see the package vignette for real examples. -z <- big.matrix(3, 3, type='integer', init=3) -z[,] -dim(z) -z[1,1] <- 2 -z[,] -zdescription <- describe(z) -zdescription -y <- attach.big.matrix(zdescription) -y[,] -y -z -zz <- attach.resource(zdescription) -zz[1,1] <- -100 -y[,] -z[,] diff --git a/examples/core_examples.R b/examples/core_examples.R deleted file mode 100644 index 24ba421..0000000 --- a/examples/core_examples.R +++ /dev/null @@ -1,51 +0,0 @@ -x <- big.matrix(10, 2, type='integer', init=-5) -options(bigmemory.allow.dimnames=TRUE) -colnames(x) <- c("alpha", "beta") -is.big.matrix(x) -dim(x) -colnames(x) -rownames(x) -x[,] -x[1:8,1] <- 11:18 -colnames(x) <- NULL -x[,] - -# The following shared memory example is quite silly, as you wouldn't -# likely do this in a single R session. But if zdescription were -# passed to another R session via SNOW, foreach, or even by a -# simple file read/write, then the attach.big.matrix() within the -# second R process would give access to the same object in memory. 
-# Please see the package vignette for real examples. - -z <- big.matrix(3, 3, type='integer', init=3) -z[,] -dim(z) -z[1,1] <- 2 -z[,] -zdescription <- describe(z) -zdescription -y <- attach.big.matrix(zdescription) -y[,] -y -z -y[1,1] <- -100 -y[,] -z[,] - -# A short filebacked example, showing the creation of associated files: -files <- dir() -files[grep("example.bin", files)] -z <- filebacked.big.matrix(3, 3, type='integer', init=123, - backingfile="example.bin", - descriptorfile="example.desc", - dimnames=list(c('a','b','c'), c('d', 'e', 'f'))) -z[,] -files <- dir() -files[grep("example.bin", files)] -zz <- attach.big.matrix("example.desc") -zz[,] -zz[1,1] <- 0 -zzz <- attach.big.matrix(describe(z)) -zzz[,] - -is.nil(z@address) diff --git a/examples/morder_examples.R b/examples/morder_examples.R deleted file mode 100644 index 13166aa..0000000 --- a/examples/morder_examples.R +++ /dev/null @@ -1,7 +0,0 @@ -m = matrix(as.double(as.matrix(iris)), nrow=nrow(iris)) -morder(m, 1) -order(m[,1]) - -m[order(m[,1]), 2] -mpermute(m, cols=1) -m[,2] diff --git a/examples/mwhich_examples.R b/examples/mwhich_examples.R deleted file mode 100644 index 0d7fba3..0000000 --- a/examples/mwhich_examples.R +++ /dev/null @@ -1,38 +0,0 @@ -x <- as.big.matrix(matrix(1:30, 10, 3)) -options(bigmemory.allow.dimnames=TRUE) -colnames(x) <- c("A", "B", "C") -x[,] -x[mwhich(x, 1:2, list(c(2,3), c(11,17)), - list(c('ge','le'), c('gt', 'lt')), 'OR'),] - -x[mwhich(x, c("A","B"), list(c(2,3), c(11,17)), - list(c('ge','le'), c('gt', 'lt')), 'AND'),] - -# These should produce the same answer with a regular matrix: -y <- matrix(1:30, 10, 3) -y[mwhich(y, 1:2, list(c(2,3), c(11,17)), - list(c('ge','le'), c('gt', 'lt')), 'OR'),] - -y[mwhich(y, -3, list(c(2,3), c(11,17)), - list(c('ge','le'), c('gt', 'lt')), 'AND'),] - - -x[1,1] <- NA -mwhich(x, 1:2, NA, 'eq', 'OR') -mwhich(x, 1:2, NA, 'neq', 'AND') - -# Column 1 equal to 4 and/or column 2 less than or equal to 16: -mwhich(x, 1:2, list(4, 16), 
list('eq', 'le'), 'OR') -mwhich(x, 1:2, list(4, 16), list('eq', 'le'), 'AND') - -# Column 2 less than or equal to 15: -mwhich(x, 2, 15, 'le') - -# No NAs in either column, and column 2 strictly less than 15: -mwhich(x, c(1:2,2), list(NA, NA, 15), list('neq', 'neq', 'lt'), 'AND') - -x <- big.matrix(4, 2, init=1, type="double") -x[1,1] <- Inf -mwhich(x, 1, Inf, 'eq') -mwhich(x, 1, 1, 'gt') -mwhich(x, 1, 1, 'le') diff --git a/examples/write.big.matrix_examples.R b/examples/write.big.matrix_examples.R deleted file mode 100644 index 9a9b469..0000000 --- a/examples/write.big.matrix_examples.R +++ /dev/null @@ -1,33 +0,0 @@ -# Without specifying the type, this big.matrix x will hold integers. -x <- as.big.matrix(matrix(1:10, 5, 2)) -x[2,2] <- NA -x[,] -write.big.matrix(x, "foo.txt") - -# Just for fun, I'll read it back in as character (1-byte integers): -y <- read.big.matrix("foo.txt", type="char") -y[,] - -# Other examples: -w <- as.big.matrix(matrix(1:10, 5, 2), type='double') -w[1,2] <- NA -w[2,2] <- -Inf -w[3,2] <- Inf -w[4,2] <- NaN -w[,] -write.big.matrix(w, "bar.txt") -w <- read.big.matrix("bar.txt", type="double") -w[,] -w <- read.big.matrix("bar.txt", type="short") -w[,] - -# Another example using row names (which we don't like). 
-x <- as.big.matrix(as.matrix(iris), type='double') -rownames(x) <- as.character(1:nrow(x)) -head(x) -write.big.matrix(x, 'IrisData.txt', col.names=TRUE, row.names=TRUE) -y <- read.big.matrix("IrisData.txt", header=TRUE, has.row.names=TRUE) -head(y) - -# The following would fail with a dimension mismatch: -if (FALSE) y <- read.big.matrix("IrisData.txt", header=TRUE) diff --git a/inst/doc/Overview.R b/inst/doc/Overview.R new file mode 100644 index 0000000..4ca182d --- /dev/null +++ b/inst/doc/Overview.R @@ -0,0 +1,78 @@ +## ----include=FALSE------------------------------------------------------- +library(knitr) +opts_chunk$set( +fig.path='graphics/stat' +) + +## ----setup,include=FALSE,echo=FALSE-------------------------------------- +options(keep.source = TRUE, width = 75) + +## ----read, eval = FALSE-------------------------------------------------- +# library(bigmemory) +# library(biganalytics) +# x <- read.big.matrix("airline.csv", type="integer", header=TRUE, +# backingfile="airline.bin", +# descriptorfile="airline.desc", +# extraCols="Age") +# summary(x) +# +# # min max mean NA's +# #Year 1987 2008 1998.62 0 +# #Month 1 12 6.55 0 +# #DayofMonth 1 31 15.72 0 +# #DayOfWeek 1 7 3.94 0 +# #ArrDelay -1437 2598 7.05 2587529 +# #DepDelay -1410 2601 8.17 2302136 +# #... (other variables omitted here) ... 
+ +## ----birth, eval = FALSE------------------------------------------------- +# birthmonth <- function(y) { +# minYear <- min(y[,'Year'], na.rm=TRUE) +# these <- which(y[,'Year']==minYear) +# minMonth <- min(y[these,'Month'], na.rm=TRUE) +# return(12*minYear + minMonth - 1) +# } + +## ----run_birth, eval = FALSE--------------------------------------------- +# allplanes <- unique(x[,'TailNum']) +# planeStart <- rep(0, length(allplanes)) +# for (i in allplanes) { +# planeStart[i] <- birthmonth( x[mwhich(x, 'TailNum', i, 'eq'), +# c('Year', 'Month'), drop=FALSE] ) +# } + +## ----bigsplit, eval = FALSE---------------------------------------------- +# library(bigtabulate) +# planeindices <- bigsplit(x, 'TailNum') + +## ----split, eval = FALSE------------------------------------------------- +# planeindices <- split(1:nrow(x), x[,'TailNum']) + +## ----sapply, eval = FALSE------------------------------------------------ +# planeStart <- sapply(planeindices, +# function(i) birthmonth(x[i, c('Year','Month'), +# drop=FALSE])) + +## ----parallel, eval = FALSE---------------------------------------------- +# library(doMC) +# registerDoMC(cores=2) +# planeStart <- foreach(i=planeindices, .combine=c) %dopar% { +# return(birthmonth(x[i, c('Year','Month'), drop=FALSE])) +# } + +## ----ages, eval = FALSE-------------------------------------------------- +# x[,'Age'] <- x[,'Year']*as.integer(12) + +# x[,'Month'] - as.integer(planeStart[x[,'TailNum']]) + +## ----biglm, eval = FALSE------------------------------------------------- +# blm <- biglm.big.matrix(ArrDelay ~ Age + Year, data=x) +# summary(blm) + +## ----biglm_output, eval = FALSE------------------------------------------ +# #Large data regression model: biglm(formula = formula, data = data, ...) 
+# #Sample size = 84216580 +# # Coef (95% CI) SE p +# #(Intercept) 91.6149 87.6509 95.5789 1.9820 0 +# #Age 0.0144 0.0142 0.0146 0.0001 0 +# #Year -0.0424 -0.0444 -0.0404 0.0010 0 + diff --git a/inst/doc/vignette/Overview.Rnw b/inst/doc/Overview.Rnw similarity index 81% rename from inst/doc/vignette/Overview.Rnw rename to inst/doc/Overview.Rnw index 0ce7b76..39ce880 100644 --- a/inst/doc/vignette/Overview.Rnw +++ b/inst/doc/Overview.Rnw @@ -1,418 +1,409 @@ -% \VignetteIndexEntry{The Bigmemory Project Overview} -% \VignetteDepends{bigmemory} -% \VignettePackage{bigmemory} -\documentclass[12pt]{article} - -\usepackage{graphics} -\usepackage{graphicx} -\usepackage{Sweave} -\usepackage{accents} - -% New from euler: -\usepackage{ae} -\usepackage{color} -\usepackage{url} - -\topmargin=-0.85in -\textheight=9.5in -\textwidth=6.5in -\oddsidemargin=0in -%-0.25in - -%\usepackage{CJK} -%\usepackage{pinyin} -\def\E{\mathord{I\kern-.35em E}} -\def\R{\mathord{I\kern-.35em R}} -\def\P{\mathord{I\kern-.35em P}} -\def\I{\mathord{1\kern-.35em 1}} -\def\wt{\mathord{\widehat{\theta}}} - -\newcommand{\proglang}[1]{\textbf{#1}} -\newcommand{\pkg}[1]{\texttt{\textsl{#1}}} -\newcommand{\code}[1]{\texttt{#1}} -\newcommand{\mg}[1]{{\textcolor {magenta} {#1}}} -\newcommand{\gr}[1]{{\textcolor {green} {#1}}} -\newcommand{\bl}[1]{{\textcolor {blue} {#1}}} - -\newtheorem{thm}{Theorem}[section] -\newtheorem{myexplore}[thm]{Explore} -\newtheorem{mybackground}[thm]{Background} -\newtheorem{myquestion}[thm]{Question} -\newtheorem{myexample}[thm]{Example} -\newtheorem{mydefinition}[thm]{Definition} -\newtheorem{mytheorem}[thm]{Theorem} - -%\pagestyle{myheadings} % Go for customized headings -%\markboth{notused left title}{John W. Emerson, Department of Statistics, Yale University \copyright 2009} -%\newcommand{\sekshun}[1] % In 'article' only the page -% { % number appears in the header. 
-% \section{#1} % I want the section name AND -% \markboth{#1 \hfill}{#1 \hfill} % the page, so I need a new kind -% } % of '\sekshun' command. - -\begin{document} - -\setkeys{Gin}{width=1.0\textwidth} -\SweaveOpts{prefix.string=graphics/stat} - -<>= -options(keep.source = TRUE, width = 75) -@ - -\begin{center} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -{\Large\bf The Bigmemory Project} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\vspace*{0.5cm} -{\bf Michael J. Kane and John W. Emerson\\ -Yale University\\ -April 29, 2010} - -\vspace*{0.25cm} - -\end{center} - -%\begin{raggedright} -\parindent=0.5in - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\begin{quotation} -Multi-gigabyte data sets often challenge and frustrate \proglang{R} -users. \proglang{C/C++} programming can provide efficiencies, -but is cumbersome for interactive data analysis and -lacks the flexibility and power of \proglang{R}'s rich statistical -programming environment. The package \pkg{bigmemory} and sister -packages \pkg{biganalytics}, \pkg{synchronicity}, \pkg{bigtabulate}, -and \pkg{bigalgebra} bridge this gap, implementing massive matrices -and supporting their manipulation and exploration. -The data structures may be allocated to shared memory, allowing separate -processes on the same computer to share access to a single copy of the -data set. The data structures may also be file-backed, allowing users -to easily manage and analyze data sets larger than available RAM and -share them across nodes of a cluster. -These features of the Bigmemory Project open the door for powerful and -memory-efficient parallel analyses and data mining of massive data sets, -even on modest hardware. 
-\end{quotation} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\vspace*{0.5cm} -\noindent -{\bf Introductory Example: the 2009 JSM Data Expo} -\vspace*{0.5cm} - -Consider the complete airline on-time performance data -from the 2009 JSM Data Expo. The processed data set, \texttt{airline.csv}, -is approximately 11 GB (about 120 million rows and 29 columns) -with factors coded as integers (see \url{http://www.bigmemory.org/} for -processing information). -The \texttt{read.big.matrix()} call creates the binary -file-backing \texttt{airline.bin} -associated with the \texttt{big.matrix} object \texttt{x}. -Subsequent \proglang{R} sessions can attach instantly to \texttt{airline.bin} -without incurring the one-time overhead (about 25 minutes) -associated with creating the backing. -A summary of the entire data set is easily obtained using the new -\texttt{summary()} method. Note the surprising -presence of negative arrival and departure delays: exploratory data analysis -in action via \pkg{bigmemory}. The summary -only takes 3-4 minutes to process the 11 GB of data on a laptop with only -4 GB of RAM. -\begin{Schunk} -\begin{Sinput} -> library(bigmemory) -> library(biganalytics) -> x <- read.big.matrix("airline.csv", type="integer", header=TRUE, -+ backingfile="airline.bin", -+ descriptorfile="airline.desc", -+ extraCols="Age") -> summary(x) - min max mean NA's -Year 1987 2008 1998.62 0 -Month 1 12 6.55 0 -DayofMonth 1 31 15.72 0 -DayOfWeek 1 7 3.94 0 -ArrDelay -1437 2598 7.05 2587529 -DepDelay -1410 2601 8.17 2302136 -... (other variables omitted here) ... -\end{Sinput} -\end{Schunk} - - -\noindent -{\bf Overview} -\vspace*{0.5cm} - -Data frames and matrices in \proglang{R} are easy to use, -with typical manipulations executing quickly on -data sets much smaller than available RAM. They suit the needs of many -\proglang{R} users and work seamlessly with existing \proglang{R} functions -and packages. 
However, problems arise with larger data sets and when -increased memory requirements of parallel programming strain the system. - -The Bigmemory Project offers packages -for two purposes. First, \pkg{bigmemory}, \pkg{biganalytics}, and -\pkg{bigtabulate} have been designed to provide a minimalist, -elegant framework for users to manage and explore large data sets, even -on modest hardware (expensive workstations or clusters are not required). -The interface is designed to mimic \proglang{R}'s familiar \code{matrix} -syntax. Matthew Keller, Assistant Professor of -Psychology, University of Colorado at Boulder offered the following -testimonial about \pkg{bigmemory}: ``I love that it's intuitive and -doesn't require a lot of learning new ways to code things.'' - -Second, the packages of the Bigmemory Project provide a foundation for -memory-efficient parallel programming and can serve as building blocks -for developers of new high-performance computing tools in \proglang{R}. -When used in conjunction with a parallel package (such as \pkg{foreach}, -\pkg{snow}, \pkg{Rmpi}, or \pkg{multicore}, for example), -even shared-memory parallel-computing becomes -accessible to non-experts. -The programming interface is stable, and offers the flexibility to support -the development of -algorithms working seamlessly on both \texttt{big.matrix} and traditional -\texttt{matrix} objects. For examples of this, look first -at the function \texttt{mwhich()}; it offers flexible \texttt{which()}-like -functionality that is computationally efficient and avoids memory overhead. -In addition, all the functions provided by \pkg{bigtabulate} may be used -with \texttt{matrix} and \texttt{big.matrix} objects alike. 
- -\vspace*{0.5cm} -\noindent -{\bf Underneath the Hood of the Bigmemory Project} -\vspace*{0.5cm} - -The packages of the Bigmemory Project use the Boost Interprocess -\proglang{C++} library to provide platform-independent support for -massive matrices that may be shared across \proglang{R} processes. -Innovative use of \proglang{C++} accessors supports matrices of -\texttt{double}, \texttt{integer}, \texttt{short}, and \texttt{char}, -as well as the development of algorithms working seamlessly on -\texttt{big.matrix} objects or traditional \proglang{R} matrices. - -\vspace*{0.5cm} -\noindent -{\bf Example: Airplane Ages and Parallel Processing} -\vspace*{0.5cm} - -We would like to approximate the age of each plane at the time of -each flight. This first requires calculation of an approximate -``birthmonth'' for each plane: the month of the first -appearance in the data set. Given a matrix -\texttt{y} containing \texttt{Year} and \code{Month} for all flights -of a given plane, \texttt{birthmonth(y)} returns the -month (in months AD) of the earliest flight: -\begin{Schunk} -\begin{Sinput} -> birthmonth <- function(y) { -+ minYear <- min(y[,'Year'], na.rm=TRUE) -+ these <- which(y[,'Year']==minYear) -+ minMonth <- min(y[these,'Month'], na.rm=TRUE) -+ return(12*minYear + minMonth - 1) -+ } -\end{Sinput} -\end{Schunk} -A traditional approach to calculating all the birthmonths might use a \code{for()} loop: -\begin{Schunk} -\begin{Sinput} -> allplanes <- unique(x[,'TailNum']) -> planeStart <- rep(0, length(allplanes)) -> for (i in allplanes) { -+ planeStart[i] <- birthmonth( x[mwhich(x, 'TailNum', i, 'eq'), -+ c('Year', 'Month'), drop=FALSE] ) -+ } -\end{Sinput} -\end{Schunk} -With about 13,000 flights this takes about 9 hours, even with the relative -fast and memory-efficient use of \texttt{mwhich()}. 
- -A far more efficient alternative is to first obtain a list of row indices -for each plane: -\begin{Schunk} -\begin{Sinput} -> library(bigtabulate) -> planeindices <- bigsplit(x, 'TailNum') -\end{Sinput} -\end{Schunk} -Here, the use of the new function \code{bigsplit()} is equivalent to -\begin{Schunk} -\begin{Sinput} -> planeindices <- split(1:nrow(x), x[,'TailNum']) -\end{Sinput} -\end{Schunk} -but is faster (16 versus 29 seconds) and more memory efficient (with -peak memory usage of 2 versus 3 GB). -Either way, -\texttt{planeindices[i]} contains all row indices corresponding to flights -with \texttt{TailNum} equal to \texttt{i}. This requires several hundred MB, -but is computationally more efficient in this problem. For example, -\texttt{planeindices} may be used with \code{sapply()} in the obvious way, -completing the task in a mere 30 seconds: -\begin{Schunk} -\begin{Sinput} -> planeStart <- sapply(planeindices, -+ function(i) birthmonth(x[i, c('Year','Month'), -+ drop=FALSE])) -\end{Sinput} -\end{Schunk} - -The looping structure \texttt{foreach()} of package \pkg{foreach} -can be a powerful and flexible alternative to \texttt{for()} or -functions like -\texttt{lapply()} and \texttt{sapply()}. It can also -take advantage of the shared-memory -capability of \pkg{bigmemory}. 
Package \pkg{doMC} provides one of several -available ``parallel backends'' for the function \texttt{foreach()}, allowing -the work to be automatically distributed to available processor cores: -\begin{Schunk} -\begin{Sinput} -> library(doMC) -> registerDoMC(cores=2) -> planeStart <- foreach(i=planeindices, .combine=c) %dopar% { -+ return(birthmonth(x[i, c('Year','Month'), drop=FALSE])) -+ } -\end{Sinput} -\end{Schunk} -The syntax of a \code{foreach()} loop is slightly different from the -syntax of a traditional loop, but its benefits are clear: -in this example, it takes only 14 seconds -to calculate the plane birthmonths using two processor cores.\footnote{We -should note that \pkg{doMC} and \pkg{multicore} are particularly well-suited -for this. When other parallel backends are used, one additional command is -required in the \code{birthmonth()} function: \code{x <- attach.big.matrix(xdesc)} -where \code{xdesc <- describe(x)} would be required just prior to the -\code{foreach()} loop, providing explicit shared-memory access across processes. -In contrast, \code{multicore} automatically operates on shared memory, -avoiding the need for this extra step.} -Both cores share access to the same master copy -of the airline data (with \texttt{Year} and \texttt{Month} cached in RAM); -individual calls to \texttt{birthmonth()} are relatively small in size. -Without the \texttt{registerDoMC()} -initialization, the \code{foreach()} loop would run on a single processor core, much -like \code{sapply()}, but taking about 24 seconds in this problem -with lower memory overhead than \code{sapply()}. 
- -Finally, the plane ages at the time of all flights may be calculated: -\begin{Schunk} -\begin{Sinput} -> x[,'Age'] <- x[,'Year']*as.integer(12) + -+ x[,'Month'] - as.integer(planeStart[x[,'TailNum']]) -\end{Sinput} -\end{Schunk} -This arithmetic is conducted on \proglang{R} vectors extracted from -the \code{big.matrix}; use of -\code{as.integer()} helps keep the memory consumption under control. - -\vspace*{0.5cm} -\noindent -{\bf Concluding Example: a Big Regression} -\vspace*{0.5cm} - -In addition to providing basic functions for exploratory data analysis, the -package \pkg{biganalytics} provides a wrapper for Thomas Lumley's -\pkg{biglm} package, supporting massive -linear and generalized linear models.\footnote{Package \pkg{biganalytics} -also provides \code{bigkmeans()}, and other analytics may be added to the -package in the future.} The following toy example examines -the airline arrival delays as a linear function of the age of the plane -at the time of the flight and the year of the flight. About 85 million -flights are used (because of missing airplane tailcodes). -We estimate that use of \proglang{R}'s \texttt{lm()} -function would require more than 10 GB of RAM of memory overhead, while -this example runs in about 3 minutes with only several hundred MB of memory -overhead. -\begin{Schunk} -\begin{Sinput} -> blm <- biglm.big.matrix(ArrDelay ~ Age + Year, data=x) -> summary(blm) -\end{Sinput} -\begin{Soutput} -Large data regression model: biglm(formula = formula, data = data, ...) -Sample size = 84216580 - Coef (95% CI) SE p -(Intercept) 91.6149 87.6509 95.5789 1.9820 0 -Age 0.0144 0.0142 0.0146 0.0001 0 -Year -0.0424 -0.0444 -0.0404 0.0010 0 -\end{Soutput} -\end{Schunk} -From this, we might conclude that older planes are associated with increased predicted -delays, and predicted delays in recent years are lower. 
However, this -exercise is merely for illustrative purposes; a serious study of airline delays would -quickly reject this oversimplification and discover problems with this particular -regression. - -\vspace*{0.5cm} -\noindent -{\bf Additional Information and Supporting Material} -\vspace*{0.5cm} - -These examples were tested both in Linux 64-bit and Windows 7 Enterprise 64-bit -environments. -Older versions of Windows operating systems (including Vista 64-bit) seem -to suffer from extremely inefficient caching behavior with filebackings and -are not recommended for use with -\pkg{bigmemory}; 32-bit environments will be limited by approximately 2 GB -of addressable memory. - -The packages are available via R-Forge and on CRAN as of -late April, 2010; please see -\url{http://www.bigmemory.org/} for more information. -There is a short vignette available in the Documentation area, -as well as presentation slides introducing \pkg{bigmemory} -and providing some benchmarks and shared-memory parallel programming -examples. Please do not use the older version of \pkg{bigmemory} -archived on CRAN (versions <= 3.12). - -\newpage - -\noindent -{\bf Citations} -\vspace*{0.5cm} - -\begin{enumerate} -\item The Bigmemory Project, \url{http://www.bigmemory.org/}, the home of \proglang{R} packages -\pkg{bigmemory}, \pkg{biganalytics}, \pkg{bigtabulate}, \pkg{bigalgebra}, and -\pkg{synchronicity}. Packages available from CRAN or R-Forge. - -\item 2009 JSM Data Expo: Airline on-time performance. \url {http://stat-computing.org/dataexpo/2009/}. - -\item Thomas Lumley (2009). \pkg{biglm}: bounded memory linear and generalized - linear models. \proglang{R} package version 0.7, - \url{http://CRAN.R-project.org/package=biglm}. - -\item \proglang{R} Development Core Team (2009). \proglang{R}: A language and environment for - statistical computing. \proglang{R} Foundation for Statistical Computing, - Vienna, Austria. ISBN 3-900051-07-0, \url{http://www.R-project.org}. 
- -\item Luke Tierney, A. J. Rossini, Na Li and H. Sevcikova (). \pkg{snow}: Simple - Network of Workstations. \proglang{R} package version 0.3-3, - \url{http://CRAN.R-project.org/package=snow}. - -\item Simon Urbanek (2009). \pkg{multicore}: Parallel processing of \proglang{R} code on - machines with multiple cores or CPUs. \proglang{R} package version 0.1-3, - \url{http://www.rforge.net/multicore/}. - -\item Stephen Weston and REvolution Computing (2009). \pkg{doMC}: Foreach parallel adaptor for the - \pkg{multicore} package. \proglang{R} package version 1.2.0, - \url{http://CRAN.R-project.org/package=doMC}. - -\item Stephen Weston and REvolution Computing (2009). \pkg{foreach}: Foreach looping -construct for \proglang{R}. \proglang{R} package version 1.3.0, -\url{http://CRAN.R-project.org/package=foreach}. - -\item Hao Yu (2010). \pkg{Rmpi}: Interface (Wrapper) to MPI (Message-Passing Interface). - \proglang{R} package version 0.5-8, \url{http://www.stats.uwo.ca/faculty/yu/Rmpi}. -\end{enumerate} - - -%\end{raggedright} - -\end{document} +% \VignetteIndexEntry{The Bigmemory Project Overview} +% \VignetteDepends{bigmemory} +% \VignettePackage{bigmemory} +% \VignetteEngine{knitr::knitr} +\documentclass[12pt]{article} + +\usepackage{graphics} +\usepackage{graphicx} + +\usepackage{accents} + +% New from euler: +\usepackage{ae} +\usepackage{color} +\usepackage{url} + +\topmargin=-0.85in +\textheight=9.5in +\textwidth=6.5in +\oddsidemargin=0in +%-0.25in + +%\usepackage{CJK} +%\usepackage{pinyin} +\def\E{\mathord{I\kern-.35em E}} +\def\R{\mathord{I\kern-.35em R}} +\def\P{\mathord{I\kern-.35em P}} +\def\I{\mathord{1\kern-.35em 1}} +\def\wt{\mathord{\widehat{\theta}}} + +\newcommand{\proglang}[1]{\textbf{#1}} +\newcommand{\pkg}[1]{\texttt{\textsl{#1}}} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\mg}[1]{{\textcolor {magenta} {#1}}} +\newcommand{\gr}[1]{{\textcolor {green} {#1}}} +\newcommand{\bl}[1]{{\textcolor {blue} {#1}}} + +\newtheorem{thm}{Theorem}[section] 
+\newtheorem{myexplore}[thm]{Explore} +\newtheorem{mybackground}[thm]{Background} +\newtheorem{myquestion}[thm]{Question} +\newtheorem{myexample}[thm]{Example} +\newtheorem{mydefinition}[thm]{Definition} +\newtheorem{mytheorem}[thm]{Theorem} + +%\pagestyle{myheadings} % Go for customized headings +%\markboth{notused left title}{John W. Emerson, Department of Statistics, Yale University \copyright 2009} +%\newcommand{\sekshun}[1] % In 'article' only the page +% { % number appears in the header. +% \section{#1} % I want the section name AND +% \markboth{#1 \hfill}{#1 \hfill} % the page, so I need a new kind +% } % of '\sekshun' command. + +\begin{document} + +\setkeys{Gin}{width=1.0\textwidth} + +<>= +library(knitr) +opts_chunk$set( +fig.path='graphics/stat' +) +@ + + +<>= +options(keep.source = TRUE, width = 75) +@ + +\begin{center} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +{\Large\bf The Bigmemory Project} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\vspace*{0.5cm} +{\bf Michael J. Kane and John W. Emerson\\ +Yale University\\ +April 29, 2010} + +\vspace*{0.25cm} + +\end{center} + +%\begin{raggedright} +\parindent=0.5in + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{quotation} +Multi-gigabyte data sets often challenge and frustrate \proglang{R} +users. \proglang{C/C++} programming can provide efficiencies, +but is cumbersome for interactive data analysis and +lacks the flexibility and power of \proglang{R}'s rich statistical +programming environment. The package \pkg{bigmemory} and sister +packages \pkg{biganalytics}, \pkg{synchronicity}, \pkg{bigtabulate}, +and \pkg{bigalgebra} bridge this gap, implementing massive matrices +and supporting their manipulation and exploration. 
+The data structures may be allocated to shared memory, allowing separate +processes on the same computer to share access to a single copy of the +data set. The data structures may also be file-backed, allowing users +to easily manage and analyze data sets larger than available RAM and +share them across nodes of a cluster. +These features of the Bigmemory Project open the door for powerful and +memory-efficient parallel analyses and data mining of massive data sets, +even on modest hardware. +\end{quotation} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\vspace*{0.5cm} +\noindent +{\bf Introductory Example: the 2009 JSM Data Expo} +\vspace*{0.5cm} + +Consider the complete airline on-time performance data +from the 2009 JSM Data Expo. The processed data set, \texttt{airline.csv}, +is approximately 11 GB (about 120 million rows and 29 columns) +with factors coded as integers (see \url{http://www.bigmemory.org/} for +processing information). +The \texttt{read.big.matrix()} call creates the binary +file-backing \texttt{airline.bin} +associated with the \texttt{big.matrix} object \texttt{x}. +Subsequent \proglang{R} sessions can attach instantly to \texttt{airline.bin} +without incurring the one-time overhead (about 25 minutes) +associated with creating the backing. +A summary of the entire data set is easily obtained using the new +\texttt{summary()} method. Note the surprising +presence of negative arrival and departure delays: exploratory data analysis +in action via \pkg{bigmemory}. The summary +only takes 3-4 minutes to process the 11 GB of data on a laptop with only +4 GB of RAM. 
+<>= +library(bigmemory) +library(biganalytics) +x <- read.big.matrix("airline.csv", type="integer", header=TRUE, + backingfile="airline.bin", + descriptorfile="airline.desc", + extraCols="Age") +summary(x) + +# min max mean NA's +#Year 1987 2008 1998.62 0 +#Month 1 12 6.55 0 +#DayofMonth 1 31 15.72 0 +#DayOfWeek 1 7 3.94 0 +#ArrDelay -1437 2598 7.05 2587529 +#DepDelay -1410 2601 8.17 2302136 +#... (other variables omitted here) ... +@ + + +\noindent +{\bf Overview} +\vspace*{0.5cm} + +Data frames and matrices in \proglang{R} are easy to use, +with typical manipulations executing quickly on +data sets much smaller than available RAM. They suit the needs of many +\proglang{R} users and work seamlessly with existing \proglang{R} functions +and packages. However, problems arise with larger data sets and when +increased memory requirements of parallel programming strain the system. + +The Bigmemory Project offers packages +for two purposes. First, \pkg{bigmemory}, \pkg{biganalytics}, and +\pkg{bigtabulate} have been designed to provide a minimalist, +elegant framework for users to manage and explore large data sets, even +on modest hardware (expensive workstations or clusters are not required). +The interface is designed to mimic \proglang{R}'s familiar \code{matrix} +syntax. Matthew Keller, Assistant Professor of +Psychology, University of Colorado at Boulder offered the following +testimonial about \pkg{bigmemory}: ``I love that it's intuitive and +doesn't require a lot of learning new ways to code things.'' + +Second, the packages of the Bigmemory Project provide a foundation for +memory-efficient parallel programming and can serve as building blocks +for developers of new high-performance computing tools in \proglang{R}. +When used in conjunction with a parallel package (such as \pkg{foreach}, +\pkg{snow}, \pkg{Rmpi}, or \pkg{multicore}, for example), +even shared-memory parallel-computing becomes +accessible to non-experts. 
+The programming interface is stable, and offers the flexibility to support +the development of +algorithms working seamlessly on both \texttt{big.matrix} and traditional +\texttt{matrix} objects. For examples of this, look first +at the function \texttt{mwhich()}; it offers flexible \texttt{which()}-like +functionality that is computationally efficient and avoids memory overhead. +In addition, all the functions provided by \pkg{bigtabulate} may be used +with \texttt{matrix} and \texttt{big.matrix} objects alike. + +\vspace*{0.5cm} +\noindent +{\bf Underneath the Hood of the Bigmemory Project} +\vspace*{0.5cm} + +The packages of the Bigmemory Project use the Boost Interprocess +\proglang{C++} library to provide platform-independent support for +massive matrices that may be shared across \proglang{R} processes. +Innovative use of \proglang{C++} accessors supports matrices of +\texttt{double}, \texttt{integer}, \texttt{short}, and \texttt{char}, +as well as the development of algorithms working seamlessly on +\texttt{big.matrix} objects or traditional \proglang{R} matrices. + +\vspace*{0.5cm} +\noindent +{\bf Example: Airplane Ages and Parallel Processing} +\vspace*{0.5cm} + +We would like to approximate the age of each plane at the time of +each flight. This first requires calculation of an approximate +``birthmonth'' for each plane: the month of the first +appearance in the data set. 
Given a matrix +\texttt{y} containing \texttt{Year} and \code{Month} for all flights +of a given plane, \texttt{birthmonth(y)} returns the +month (in months AD) of the earliest flight: +<>= +birthmonth <- function(y) { + minYear <- min(y[,'Year'], na.rm=TRUE) + these <- which(y[,'Year']==minYear) + minMonth <- min(y[these,'Month'], na.rm=TRUE) + return(12*minYear + minMonth - 1) +} +@ +A traditional approach to calculating all the birthmonths might use a \code{for()} loop: +<>= +allplanes <- unique(x[,'TailNum']) +planeStart <- rep(0, length(allplanes)) +for (i in allplanes) { + planeStart[i] <- birthmonth( x[mwhich(x, 'TailNum', i, 'eq'), + c('Year', 'Month'), drop=FALSE] ) +} +@ +With about 13,000 planes this takes about 9 hours, even with the relatively +fast and memory-efficient use of \texttt{mwhich()}. + +A far more efficient alternative is to first obtain a list of row indices +for each plane: +<>= +library(bigtabulate) +planeindices <- bigsplit(x, 'TailNum') +@ +Here, the use of the new function \code{bigsplit()} is equivalent to +<>= +planeindices <- split(1:nrow(x), x[,'TailNum']) +@ +but is faster (16 versus 29 seconds) and more memory efficient (with +peak memory usage of 2 versus 3 GB). +Either way, +\texttt{planeindices[i]} contains all row indices corresponding to flights +with \texttt{TailNum} equal to \texttt{i}. This requires several hundred MB, +but is computationally more efficient in this problem. For example, +\texttt{planeindices} may be used with \code{sapply()} in the obvious way, +completing the task in a mere 30 seconds: +<>= +planeStart <- sapply(planeindices, + function(i) birthmonth(x[i, c('Year','Month'), + drop=FALSE])) +@ + +The looping structure \texttt{foreach()} of package \pkg{foreach} +can be a powerful and flexible alternative to \texttt{for()} or +functions like +\texttt{lapply()} and \texttt{sapply()}. It can also +take advantage of the shared-memory +capability of \pkg{bigmemory}.
Package \pkg{doMC} provides one of several +available ``parallel backends'' for the function \texttt{foreach()}, allowing +the work to be automatically distributed to available processor cores: +<>= +library(doMC) +registerDoMC(cores=2) +planeStart <- foreach(i=planeindices, .combine=c) %dopar% { + return(birthmonth(x[i, c('Year','Month'), drop=FALSE])) +} +@ +The syntax of a \code{foreach()} loop is slightly different from the +syntax of a traditional loop, but its benefits are clear: +in this example, it takes only 14 seconds +to calculate the plane birthmonths using two processor cores.\footnote{We +should note that \pkg{doMC} and \pkg{multicore} are particularly well-suited +for this. When other parallel backends are used, one additional command is +required in the \code{birthmonth()} function: \code{x <- attach.big.matrix(xdesc)} +where \code{xdesc <- describe(x)} would be required just prior to the +\code{foreach()} loop, providing explicit shared-memory access across processes. +In contrast, \code{multicore} automatically operates on shared memory, +avoiding the need for this extra step.} +Both cores share access to the same master copy +of the airline data (with \texttt{Year} and \texttt{Month} cached in RAM); +individual calls to \texttt{birthmonth()} are relatively small in size. +Without the \texttt{registerDoMC()} +initialization, the \code{foreach()} loop would run on a single processor core, much +like \code{sapply()}, but taking about 24 seconds in this problem +with lower memory overhead than \code{sapply()}. + +Finally, the plane ages at the time of all flights may be calculated: +<>= +x[,'Age'] <- x[,'Year']*as.integer(12) + + x[,'Month'] - as.integer(planeStart[x[,'TailNum']]) +@ +This arithmetic is conducted on \proglang{R} vectors extracted from +the \code{big.matrix}; use of +\code{as.integer()} helps keep the memory consumption under control. 
+ +\vspace*{0.5cm} +\noindent +{\bf Concluding Example: a Big Regression} +\vspace*{0.5cm} + +In addition to providing basic functions for exploratory data analysis, the +package \pkg{biganalytics} provides a wrapper for Thomas Lumley's +\pkg{biglm} package, supporting massive +linear and generalized linear models.\footnote{Package \pkg{biganalytics} +also provides \code{bigkmeans()}, and other analytics may be added to the +package in the future.} The following toy example examines +the airline arrival delays as a linear function of the age of the plane +at the time of the flight and the year of the flight. About 85 million +flights are used (because of missing airplane tailcodes). +We estimate that use of \proglang{R}'s \texttt{lm()} +function would require more than 10 GB of RAM of memory overhead, while +this example runs in about 3 minutes with only several hundred MB of memory +overhead. +<>= +blm <- biglm.big.matrix(ArrDelay ~ Age + Year, data=x) +summary(blm) +@ +<>= +#Large data regression model: biglm(formula = formula, data = data, ...) +#Sample size = 84216580 +# Coef (95% CI) SE p +#(Intercept) 91.6149 87.6509 95.5789 1.9820 0 +#Age 0.0144 0.0142 0.0146 0.0001 0 +#Year -0.0424 -0.0444 -0.0404 0.0010 0 +@ +From this, we might conclude that older planes are associated with increased predicted +delays, and predicted delays in recent years are lower. However, this +exercise is merely for illustrative purposes; a serious study of airline delays would +quickly reject this oversimplification and discover problems with this particular +regression. + +\vspace*{0.5cm} +\noindent +{\bf Additional Information and Supporting Material} +\vspace*{0.5cm} + +These examples were tested both in Linux 64-bit and Windows 7 Enterprise 64-bit +environments. 
+Older versions of Windows operating systems (including Vista 64-bit) seem +to suffer from extremely inefficient caching behavior with filebackings and +are not recommended for use with +\pkg{bigmemory}; 32-bit environments will be limited by approximately 2 GB +of addressable memory. + +The packages are available via R-Forge and on CRAN as of +late April, 2010; please see +\url{http://www.bigmemory.org/} for more information. +There is a short vignette available in the Documentation area, +as well as presentation slides introducing \pkg{bigmemory} +and providing some benchmarks and shared-memory parallel programming +examples. Please do not use the older version of \pkg{bigmemory} +archived on CRAN (versions <= 3.12). + +\newpage + +\noindent +{\bf Citations} +\vspace*{0.5cm} + +\begin{enumerate} +\item The Bigmemory Project, \url{http://www.bigmemory.org/}, the home of \proglang{R} packages +\pkg{bigmemory}, \pkg{biganalytics}, \pkg{bigtabulate}, \pkg{bigalgebra}, and +\pkg{synchronicity}. Packages available from CRAN or R-Forge. + +\item 2009 JSM Data Expo: Airline on-time performance. \url {http://stat-computing.org/dataexpo/2009/}. + +\item Thomas Lumley (2009). \pkg{biglm}: bounded memory linear and generalized + linear models. \proglang{R} package version 0.7, + \url{http://CRAN.R-project.org/package=biglm}. + +\item \proglang{R} Development Core Team (2009). \proglang{R}: A language and environment for + statistical computing. \proglang{R} Foundation for Statistical Computing, + Vienna, Austria. ISBN 3-900051-07-0, \url{http://www.R-project.org}. + +\item Luke Tierney, A. J. Rossini, Na Li and H. Sevcikova (). \pkg{snow}: Simple + Network of Workstations. \proglang{R} package version 0.3-3, + \url{http://CRAN.R-project.org/package=snow}. + +\item Simon Urbanek (2009). \pkg{multicore}: Parallel processing of \proglang{R} code on + machines with multiple cores or CPUs. \proglang{R} package version 0.1-3, + \url{http://www.rforge.net/multicore/}. 
+ +\item Stephen Weston and REvolution Computing (2009). \pkg{doMC}: Foreach parallel adaptor for the + \pkg{multicore} package. \proglang{R} package version 1.2.0, + \url{http://CRAN.R-project.org/package=doMC}. + +\item Stephen Weston and REvolution Computing (2009). \pkg{foreach}: Foreach looping +construct for \proglang{R}. \proglang{R} package version 1.3.0, +\url{http://CRAN.R-project.org/package=foreach}. + +\item Hao Yu (2010). \pkg{Rmpi}: Interface (Wrapper) to MPI (Message-Passing Interface). + \proglang{R} package version 0.5-8, \url{http://www.stats.uwo.ca/faculty/yu/Rmpi}. +\end{enumerate} + + +%\end{raggedright} + +\end{document} diff --git a/inst/doc/vignette/Overview.pdf b/inst/doc/Overview.pdf similarity index 55% rename from inst/doc/vignette/Overview.pdf rename to inst/doc/Overview.pdf index f493947..13ce8b5 100644 Binary files a/inst/doc/vignette/Overview.pdf and b/inst/doc/Overview.pdf differ diff --git a/man-roxygen/attach.big.matrix_template.R b/man-roxygen/attach.big.matrix_template.R index 05079d5..2164d01 100644 --- a/man-roxygen/attach.big.matrix_template.R +++ b/man-roxygen/attach.big.matrix_template.R @@ -30,5 +30,28 @@ #' \email{} #' @seealso \code{\link{bigmemory}}, \code{\link{big.matrix}}, or the class #' documentation \code{\linkS4class{big.matrix}}. -#' @example examples/attach.big.matrix_examples.R +#' @examples \dontrun{ +#' # The example is quite silly, as you wouldn't likely do this in a +#' # single R session. But if zdescription were passed to another R session +#' # via SNOW, foreach, or even by a simple file read/write, +#' # then the attach of the second R process would give access to the +#' # same object in memory. Please see the package vignette for real examples. 
+#' +#' # Not run +#' z <- big.matrix(3, 3, type='integer', init=3) +#' z[,] +#' dim(z) +#' z[1,1] <- 2 +#' z[,] +#' zdescription <- describe(z) +#' zdescription +#' y <- attach.big.matrix(zdescription) +#' y[,] +#' y +#' z +#' zz <- attach.resource(zdescription) +#' zz[1,1] <- -100 +#' y[,] +#' z[,] +#' } #' @keywords classes methods diff --git a/man-roxygen/core_template.R b/man-roxygen/core_template.R index 6b28c34..44f9c8d 100644 --- a/man-roxygen/core_template.R +++ b/man-roxygen/core_template.R @@ -118,5 +118,60 @@ #' \code{\linkS4class{big.matrix}}; \code{\link{attach.big.matrix}} and #' \code{\link{describe}}. Sister packages \pkg{biganalytics}, \pkg{bigtabulate}, #' \pkg{synchronicity}, and \pkg{bigalgebra} provide advanced functionality. -#' @example examples/core_examples.R +#' @examples \dontrun{ +#' # Not Run +#' library(bigmemory) +#' x <- big.matrix(10, 2, type='integer', init=-5) +#' options(bigmemory.allow.dimnames=TRUE) +#' colnames(x) <- c("alpha", "beta") +#' is.big.matrix(x) +#' dim(x) +#' colnames(x) +#' rownames(x) +#' x[,] +#' x[1:8,1] <- 11:18 +#' colnames(x) <- NULL +#' x[,] +#' +#' # The following shared memory example is quite silly, as you wouldn't +#' # likely do this in a single R session. But if zdescription were +#' # passed to another R session via SNOW, foreach, or even by a +#' # simple file read/write, then the attach.big.matrix() within the +#' # second R process would give access to the same object in memory. +#' # Please see the package vignette for real examples. 
+#' +#' # Not run +#' z <- big.matrix(3, 3, type='integer', init=3) +#' z[,] +#' dim(z) +#' z[1,1] <- 2 +#' z[,] +#' zdescription <- describe(z) +#' zdescription +#' y <- attach.big.matrix(zdescription) +#' y[,] +#' y +#' z +#' y[1,1] <- -100 +#' y[,] +#' z[,] +#' +#' # A short filebacked example, showing the creation of associated files: +#' +#' files <- dir() +#' files[grep("example.bin", files)] +#' +#' z <- filebacked.big.matrix(3, 3, type='integer', init=123, +#' backingfile="example.bin", +#' descriptorfile="example.desc", +#' dimnames=list(c('a','b','c'), c('d', 'e', 'f'))) +#' z[,] +#' files <- dir() +#' files[grep("example.bin", files)] +#' zz <- attach.big.matrix("example.desc") +#' zz[,] +#' zz[1,1] <- 0 +#' zzz <- attach.big.matrix(describe(z)) +#' zzz[,] +#' } #' @keywords classes methods diff --git a/man-roxygen/deepcopy_template.R b/man-roxygen/deepcopy_template.R index b1c6cd8..3cd26e2 100644 --- a/man-roxygen/deepcopy_template.R +++ b/man-roxygen/deepcopy_template.R @@ -36,11 +36,11 @@ #' @seealso \code{\link{big.matrix}} #' @keywords methods #' @examples -#' -#' +#' \dontrun{ #' x <- as.big.matrix(matrix(1:30, 10, 3)) #' y <- deepcopy(x, -1) # Don't include the first column. #' x #' y #' head(x) #' head(y) +#' } diff --git a/man-roxygen/flush_template.R b/man-roxygen/flush_template.R index 19423cb..f7040bd 100644 --- a/man-roxygen/flush_template.R +++ b/man-roxygen/flush_template.R @@ -11,9 +11,12 @@ #' on flushing creates a bottleneck (likely near the threshold of available \acronym{RAM}). #' @return \code{TRUE} or \code{FALSE} (invisible), indicating whether or not the flush was successful. #' @author John W. Emerson and Michael J. 
Kane -#' @examples +#' @examples \dontrun{ #' x <- big.matrix(nrow=3, ncol=3, backingfile='flushtest.bin', #' descriptorfile='flushtest.desc', type='integer') #' x[1,1] <- 0 #' flush(x) +#' } +#' @docType methods +#' @rdname flush-methods #' @keywords methods diff --git a/man-roxygen/morder_template.R b/man-roxygen/morder_template.R index ad1790a..5319188 100644 --- a/man-roxygen/morder_template.R +++ b/man-roxygen/morder_template.R @@ -42,4 +42,12 @@ #' #' @author Michael J. Kane \email{} #' @seealso \code{\link{order}} -#' @example examples/morder_examples.R +#' @examples \dontrun{ +#' m = matrix(as.double(as.matrix(iris)), nrow=nrow(iris)) +#' morder(m, 1) +#' order(m[,1]) +#' +#' m[order(m[,1]), 2] +#' mpermute(m, cols=1) +#' m[,2] +#' } diff --git a/man-roxygen/mwhich_template.R b/man-roxygen/mwhich_template.R index 947dcbf..a94a4bb 100644 --- a/man-roxygen/mwhich_template.R +++ b/man-roxygen/mwhich_template.R @@ -38,5 +38,44 @@ #' @return a vector of row indices satisfying the criteria. #' @author John W. 
Emerson \email{} #' @seealso \code{\link{big.matrix}}, \code{\link{which}} -#' @example examples/mwhich_examples.R +#' @examples \dontrun{ +#' x <- as.big.matrix(matrix(1:30, 10, 3)) +#' options(bigmemory.allow.dimnames=TRUE) +#' colnames(x) <- c("A", "B", "C") +#' x[,] +#' x[mwhich(x, 1:2, list(c(2,3), c(11,17)), +#' list(c('ge','le'), c('gt', 'lt')), 'OR'),] +#' +#' x[mwhich(x, c("A","B"), list(c(2,3), c(11,17)), +#' list(c('ge','le'), c('gt', 'lt')), 'AND'),] +#' +#' # These should produce the same answer with a regular matrix: +#' y <- matrix(1:30, 10, 3) +#' y[mwhich(y, 1:2, list(c(2,3), c(11,17)), +#' list(c('ge','le'), c('gt', 'lt')), 'OR'),] +#' +#' y[mwhich(y, -3, list(c(2,3), c(11,17)), +#' list(c('ge','le'), c('gt', 'lt')), 'AND'),] +#' +#' +#' x[1,1] <- NA +#' mwhich(x, 1:2, NA, 'eq', 'OR') +#' mwhich(x, 1:2, NA, 'neq', 'AND') +#' +#' # Column 1 equal to 4 and/or column 2 less than or equal to 16: +#' mwhich(x, 1:2, list(4, 16), list('eq', 'le'), 'OR') +#' mwhich(x, 1:2, list(4, 16), list('eq', 'le'), 'AND') +#' +#' # Column 2 less than or equal to 15: +#' mwhich(x, 2, 15, 'le') +#' +#' # No NAs in either column, and column 2 strictly less than 15: +#' mwhich(x, c(1:2,2), list(NA, NA, 15), list('neq', 'neq', 'lt'), 'AND') +#' +#' x <- big.matrix(4, 2, init=1, type="double") +#' x[1,1] <- Inf +#' mwhich(x, 1, Inf, 'eq') +#' mwhich(x, 1, 1, 'gt') +#' mwhich(x, 1, 1, 'le') +#' } #' @keywords methods diff --git a/man-roxygen/sub.big.matrix_template.R b/man-roxygen/sub.big.matrix_template.R index 404107d..385d096 100644 --- a/man-roxygen/sub.big.matrix_template.R +++ b/man-roxygen/sub.big.matrix_template.R @@ -24,11 +24,12 @@ #' It is not a physical copy. Only contiguous blocks may form a submatrix. #' @author John W. Emerson and Michael J. 
Kane #' @seealso \code{\link{big.matrix}} -#' @examples +#' @examples \dontrun{ #' x <- big.matrix(10, 5, init=0, type="double") #' x[,] <- 1:50 #' y <- sub.big.matrix(x, 2, 9, 2, 3) #' y[,] #' y[1,1] <- -99 #' x[,] +#' } #' @keywords methods diff --git a/man-roxygen/write.big.matrix_template.R b/man-roxygen/write.big.matrix_template.R index beca85f..7a162d0 100644 --- a/man-roxygen/write.big.matrix_template.R +++ b/man-roxygen/write.big.matrix_template.R @@ -64,5 +64,40 @@ #' @author John W. Emerson and Michael J. Kane #' \email{} #' @seealso \code{\link{big.matrix}} -#' @example examples/write.big.matrix_examples.R +#' @examples \dontrun{ +#' # Without specifying the type, this big.matrix x will hold integers. +#' +#' x <- as.big.matrix(matrix(1:10, 5, 2)) +#' x[2,2] <- NA +#' x[,] +#' write.big.matrix(x, "foo.txt") +#' +#' # Just for fun, I'll read it back in as character (1-byte integers): +#' y <- read.big.matrix("foo.txt", type="char") +#' y[,] +#' +#' # Other examples: +#' w <- as.big.matrix(matrix(1:10, 5, 2), type='double') +#' w[1,2] <- NA +#' w[2,2] <- -Inf +#' w[3,2] <- Inf +#' w[4,2] <- NaN +#' w[,] +#' write.big.matrix(w, "bar.txt") +#' w <- read.big.matrix("bar.txt", type="double") +#' w[,] +#' w <- read.big.matrix("bar.txt", type="short") +#' w[,] +#' +#' # Another example using row names (which we don't like). 
+#' x <- as.big.matrix(as.matrix(iris), type='double') +#' rownames(x) <- as.character(1:nrow(x)) +#' head(x) +#' write.big.matrix(x, 'IrisData.txt', col.names=TRUE, row.names=TRUE) +#' y <- read.big.matrix("IrisData.txt", header=TRUE, has.row.names=TRUE) +#' head(y) +#' +#' # The following would fail with a dimension mismatch: +#' if (FALSE) y <- read.big.matrix("IrisData.txt", header=TRUE) +#' } #' @keywords methods diff --git a/man/GetMatrixSize.Rd b/man/GetMatrixSize.Rd new file mode 100644 index 0000000..2345151 --- /dev/null +++ b/man/GetMatrixSize.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{GetMatrixSize} +\alias{GetMatrixSize} +\title{big.matrix size} +\usage{ +GetMatrixSize(bigMat) +} +\arguments{ +\item{bigMat}{a \code{big.matrix} object} +} +\description{ +Returns the size of the created matrix in bytes +} + diff --git a/man/as.matrix-big.matrix-method.Rd b/man/as.matrix-big.matrix-method.Rd new file mode 100644 index 0000000..b7ba0de --- /dev/null +++ b/man/as.matrix-big.matrix-method.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{as.matrix,big.matrix-method} +\alias{as.matrix,big.matrix-method} +\title{Convert to base R matrix} +\usage{ +\S4method{as.matrix}{big.matrix}(x) +} +\arguments{ +\item{x}{A big.matrix object} +} +\description{ +Extract values from a \code{big.matrix} object +and convert to a base R matrix object +} + diff --git a/man/attach.big.matrix.Rd b/man/attach.big.matrix.Rd index 1afac00..7f1caad 100644 --- a/man/attach.big.matrix.Rd +++ b/man/attach.big.matrix.Rd @@ -44,11 +44,14 @@ A descriptor file is automatically created when a new filebacked \code{big.matrix} is created. } \examples{ +\dontrun{ # The example is quite silly, as you wouldn't likely do this in a # single R session. 
But if zdescription were passed to another R session # via SNOW, foreach, or even by a simple file read/write, # then the attach of the second R process would give access to the # same object in memory. Please see the package vignette for real examples. + +# Not run z <- big.matrix(3, 3, type='integer', init=3) z[,] dim(z) @@ -65,6 +68,7 @@ zz[1,1] <- -100 y[,] z[,] } +} \author{ Michael J. Kane and John W. Emerson \email{} diff --git a/man/big.matrix.Rd b/man/big.matrix.Rd index 38b01f5..aaaef6b 100644 --- a/man/big.matrix.Rd +++ b/man/big.matrix.Rd @@ -1,17 +1,26 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bigmemory.R +\docType{methods} \name{big.matrix} \alias{as.big.matrix} \alias{big.matrix} \alias{file.name} +\alias{file.name,big.matrix-method} \alias{filebacked.big.matrix} \alias{is.big.matrix} +\alias{is.big.matrix,ANY-method} +\alias{is.big.matrix,big.matrix-method} \alias{is.filebacked} +\alias{is.filebacked,big.matrix-method} \alias{is.nil} \alias{is.readonly} +\alias{is.readonly,big.matrix-method} \alias{is.separated} +\alias{is.separated,big.matrix-method} \alias{is.shared} +\alias{is.shared,big.matrix-method} \alias{shared.name} +\alias{shared.name,big.matrix-method} \title{The core "big.matrix" operations.} \usage{ big.matrix(nrow, ncol, type = options()$bigmemory.default.type, init = NULL, @@ -29,18 +38,34 @@ as.big.matrix(x, type = NULL, separated = FALSE, backingfile = NULL, is.big.matrix(x) +\S4method{is.big.matrix}{big.matrix}(x) + +\S4method{is.big.matrix}{ANY}(x) + is.separated(x) +\S4method{is.separated}{big.matrix}(x) + is.filebacked(x) +\S4method{is.filebacked}{big.matrix}(x) + shared.name(x) +\S4method{shared.name}{big.matrix}(x) + file.name(x) +\S4method{file.name}{big.matrix}(x) + is.shared(x) +\S4method{is.shared}{big.matrix}(x) + is.readonly(x) +\S4method{is.readonly}{big.matrix}(x) + is.nil(address) } \arguments{ @@ -175,6 +200,9 @@ factors numeric before forming the \code{big.matrix}. 
Level labels are not preserved and must be managed by the user if desired. } \examples{ +\dontrun{ +# Not Run +library(bigmemory) x <- big.matrix(10, 2, type='integer', init=-5) options(bigmemory.allow.dimnames=TRUE) colnames(x) <- c("alpha", "beta") @@ -194,6 +222,7 @@ x[,] # second R process would give access to the same object in memory. # Please see the package vignette for real examples. +# Not run z <- big.matrix(3, 3, type='integer', init=3) z[,] dim(z) @@ -210,8 +239,10 @@ y[,] z[,] # A short filebacked example, showing the creation of associated files: + files <- dir() files[grep("example.bin", files)] + z <- filebacked.big.matrix(3, 3, type='integer', init=123, backingfile="example.bin", descriptorfile="example.desc", @@ -224,8 +255,7 @@ zz[,] zz[1,1] <- 0 zzz <- attach.big.matrix(describe(z)) zzz[,] - -is.nil(z@address) +} } \author{ John W. Emerson and Michael J. Kane diff --git a/man/big.matrix.descriptor-class.Rd b/man/big.matrix.descriptor-class.Rd index b06f900..c167aeb 100644 --- a/man/big.matrix.descriptor-class.Rd +++ b/man/big.matrix.descriptor-class.Rd @@ -16,6 +16,25 @@ \S4method{attach.resource}{big.matrix.descriptor}(obj, ...) } +\arguments{ +\item{x}{A descriptor object} + +\item{firstRow}{the first row of the submatrix} + +\item{lastRow}{the last row of the submatrix if not NULL} + +\item{firstCol}{the first column of the submatrix} + +\item{lastCol}{the last column of the submatrix if not NULL} + +\item{backingpath}{required path to the filebacked object, if applicable} + +\item{obj}{The filename of the descriptor for a filebacked matrix, +assumed to be in the directory specified} + +\item{...}{possibly \code{path} which gives the path where the descriptor +and/or filebacking can be found.} +} \description{ An object of this class contains necessary and sufficient information to ``attach'' a shared or filebacked \code{\link{big.matrix}}.
diff --git a/man/deepcopy.Rd b/man/deepcopy.Rd index 99687f8..6dc4207 100644 --- a/man/deepcopy.Rd +++ b/man/deepcopy.Rd @@ -58,6 +58,7 @@ traditional syntax would only copy the object (the pointer to the It can also make a copy of only a subset of columns. } \examples{ +\dontrun{ x <- as.big.matrix(matrix(1:30, 10, 3)) y <- deepcopy(x, -1) # Don't include the first column. x @@ -65,6 +66,7 @@ y head(x) head(y) } +} \seealso{ \code{\link{big.matrix}} } diff --git a/man/dim-big.matrix-method.Rd b/man/dim-big.matrix-method.Rd new file mode 100644 index 0000000..3c64ad1 --- /dev/null +++ b/man/dim-big.matrix-method.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{dim,big.matrix-method} +\alias{dim,big.matrix-method} +\title{Dimensions of a big.matrix object} +\usage{ +\S4method{dim}{big.matrix}(x) +} +\arguments{ +\item{x}{A \code{big.matrix} object} +} +\description{ +Retrieve the dimensions of a \code{big.matrix} object +} + diff --git a/man/dimnames-methods.Rd b/man/dimnames-methods.Rd new file mode 100644 index 0000000..78bb6c7 --- /dev/null +++ b/man/dimnames-methods.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{dimnames,big.matrix-method} +\alias{dimnames,big.matrix-method} +\alias{dimnames<-,big.matrix,list-method} +\title{Dimnames of a big.matrix Object} +\usage{ +\S4method{dimnames}{big.matrix}(x) + +\S4method{dimnames}{big.matrix,list}(x) <- value +} +\arguments{ +\item{x}{A big.matrix object} + +\item{value}{A possible value for \code{dimnames(x)}} +} +\description{ +Retrieve or set the dimnames of an object +} + diff --git a/man/extract-methods.Rd b/man/extract-methods.Rd new file mode 100644 index 0000000..3c8612e --- /dev/null +++ b/man/extract-methods.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in 
R/bigmemory.R +\docType{methods} +\name{Extract,big.matrix} +\alias{Extract,big.matrix} +\alias{[,big.matrix,ANY,ANY,logical-method} +\alias{[,big.matrix,ANY,ANY,missing-method} +\alias{[,big.matrix,ANY,missing,logical-method} +\alias{[,big.matrix,ANY,missing,missing-method} +\alias{[,big.matrix,matrix,missing,missing-method} +\alias{[,big.matrix,missing,ANY,logical-method} +\alias{[,big.matrix,missing,ANY,missing-method} +\alias{[,big.matrix,missing,missing,logical-method} +\alias{[,big.matrix,missing,missing,missing-method} +\alias{[<-,big.matrix,ANY,ANY-method} +\alias{[<-,big.matrix,ANY,missing-method} +\alias{[<-,big.matrix,matrix,missing-method} +\alias{[<-,big.matrix,missing,ANY-method} +\alias{[<-,big.matrix,missing,missing-method} +\title{Extract or Replace big.matrix elements} +\usage{ +\S4method{[}{big.matrix,ANY,ANY,missing}(x, i, j, drop) + +\S4method{[}{big.matrix,ANY,ANY,logical}(x, i, j, drop) + +\S4method{[}{big.matrix,missing,ANY,missing}(x, i, j, drop) + +\S4method{[}{big.matrix,missing,ANY,logical}(x, i, j, drop) + +\S4method{[}{big.matrix,ANY,missing,missing}(x, i, j, drop) + +\S4method{[}{big.matrix,ANY,missing,logical}(x, i, j, drop) + +\S4method{[}{big.matrix,missing,missing,missing}(x, i, j, drop) + +\S4method{[}{big.matrix,missing,missing,logical}(x, i, j, drop) + +\S4method{[}{big.matrix,matrix,missing,missing}(x, i, j, drop) + +\S4method{[}{big.matrix,ANY,ANY}(x, i, j) <- value + +\S4method{[}{big.matrix,missing,ANY}(x, i, j) <- value + +\S4method{[}{big.matrix,ANY,missing}(x, i, j) <- value + +\S4method{[}{big.matrix,missing,missing}(x, i, j) <- value + +\S4method{[}{big.matrix,matrix,missing}(x, i, j) <- value +} +\arguments{ +\item{x}{A \code{big.matrix} object} + +\item{i}{Indices specifying the rows} + +\item{j}{Indices specifying the columns} + +\item{drop}{Logical indicating whether to reduce to minimum dimensions} + +\item{value}{typically an array-like R object of similar class} +} +\description{ +Extract or Replace big.matrix elements
+} + diff --git a/man/flush.Rd b/man/flush-methods.Rd similarity index 91% rename from man/flush.Rd rename to man/flush-methods.Rd index d6047a4..405bcd7 100644 --- a/man/flush.Rd +++ b/man/flush-methods.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bigmemory.R +\docType{methods} \name{flush} \alias{flush} +\alias{flush,big.matrix-method} \title{Updating a big.matrix filebacking.} \usage{ flush(con) + +\S4method{flush}{big.matrix}(con) } \arguments{ \item{con}{filebacked \code{\link{big.matrix}}.} @@ -23,11 +27,13 @@ This function flushes any modified data (in \acronym{RAM}) of a file-backed on flushing creates a bottleneck (likely near the threshold of available \acronym{RAM}). } \examples{ +\dontrun{ x <- big.matrix(nrow=3, ncol=3, backingfile='flushtest.bin', descriptorfile='flushtest.desc', type='integer') x[1,1] <- 0 flush(x) } +} \author{ John W. Emerson and Michael J. Kane } diff --git a/man/head-methods.Rd b/man/head-methods.Rd new file mode 100644 index 0000000..a7de2fd --- /dev/null +++ b/man/head-methods.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{head,big.matrix-method} +\alias{head,big.matrix-method} +\alias{tail,big.matrix-method} +\title{Return First or Last Part of a big.matrix Object} +\usage{ +\S4method{head}{big.matrix}(x, n = 6) + +\S4method{tail}{big.matrix}(x, n = 6) +} +\arguments{ +\item{x}{A big.matrix object} + +\item{n}{A single integer for the number of rows to return} +} +\description{ +Returns the first or last parts of a \code{big.matrix} +object. 
+} + diff --git a/man/is.float-numeric-method.Rd b/man/is.float-numeric-method.Rd new file mode 100644 index 0000000..9f6d016 --- /dev/null +++ b/man/is.float-numeric-method.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{is.float,numeric-method} +\alias{is.float,numeric-method} +\title{Is Float?} +\usage{ +\S4method{is.float}{numeric}(x) +} +\arguments{ +\item{x}{A numeric value} +} +\description{ +Check if R numeric value has float flag +} + diff --git a/man/is.float.Rd b/man/is.float.Rd new file mode 100644 index 0000000..5fa0eff --- /dev/null +++ b/man/is.float.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\name{is.float} +\alias{is.float} +\title{Check if Float} +\usage{ +is.float(x) +} +\arguments{ +\item{x}{An object to be evaluated if float} +} +\description{ +Check if Float +} + diff --git a/man/length-big.matrix-method.Rd b/man/length-big.matrix-method.Rd new file mode 100644 index 0000000..87bda17 --- /dev/null +++ b/man/length-big.matrix-method.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{length,big.matrix-method} +\alias{length,big.matrix-method} +\title{Length of a big.matrix object} +\usage{ +\S4method{length}{big.matrix}(x) +} +\arguments{ +\item{x}{A \code{big.matrix} object} +} +\description{ +Get the length of a \code{big.matrix} object +} + diff --git a/man/morder.Rd b/man/morder.Rd index 60fb1bf..ce0abdb 100644 --- a/man/morder.Rd +++ b/man/morder.Rd @@ -62,6 +62,7 @@ this function has side-effects, that is \code{x} is changed when this function is called. } \examples{ +\dontrun{ m = matrix(as.double(as.matrix(iris)), nrow=nrow(iris)) morder(m, 1) order(m[,1]) @@ -70,6 +71,7 @@ m[order(m[,1]), 2] mpermute(m, cols=1) m[,2] } +} \author{ Michael J. 
Kane \email{} } diff --git a/man/mwhich.Rd b/man/mwhich.Rd index 4c45d78..4bfdb8a 100644 --- a/man/mwhich.Rd +++ b/man/mwhich.Rd @@ -53,6 +53,7 @@ and \code{\link{intersect}}, for example) on the results of multiple pointer trick (accessor) in \acronym{C++}. } \examples{ +\dontrun{ x <- as.big.matrix(matrix(1:30, 10, 3)) options(bigmemory.allow.dimnames=TRUE) colnames(x) <- c("A", "B", "C") @@ -60,7 +61,7 @@ x[,] x[mwhich(x, 1:2, list(c(2,3), c(11,17)), list(c('ge','le'), c('gt', 'lt')), 'OR'),] -x[mwhich(x, c("A","B"), list(c(2,3), c(11,17)), +x[mwhich(x, c("A","B"), list(c(2,3), c(11,17)), list(c('ge','le'), c('gt', 'lt')), 'AND'),] # These should produce the same answer with a regular matrix: @@ -92,6 +93,7 @@ mwhich(x, 1, Inf, 'eq') mwhich(x, 1, 1, 'gt') mwhich(x, 1, 1, 'le') } +} \author{ John W. Emerson \email{} } diff --git a/man/ncol-methods.Rd b/man/ncol-methods.Rd new file mode 100644 index 0000000..e63195f --- /dev/null +++ b/man/ncol-methods.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{ncol,big.matrix-method} +\alias{ncol,big.matrix-method} +\alias{nrow,big.matrix-method} +\title{The Number of Rows/Columns of a big.matrix} +\usage{ +\S4method{ncol}{big.matrix}(x) + +\S4method{nrow}{big.matrix}(x) +} +\arguments{ +\item{x}{A big.matrix object} +} +\value{ +An integer of length 1 +} +\description{ +\code{nrow} and \code{ncol} return the number of +rows or columns present in a \code{big.matrix} object. 
+} + diff --git a/man/print-big.matrix-method.Rd b/man/print-big.matrix-method.Rd new file mode 100644 index 0000000..ff2005c --- /dev/null +++ b/man/print-big.matrix-method.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{print,big.matrix-method} +\alias{print,big.matrix-method} +\title{Print Values} +\usage{ +\S4method{print}{big.matrix}(x) +} +\arguments{ +\item{x}{A \code{big.matrix} object} +} +\description{ +\code{print} will print out the elements within +a \code{big.matrix} object. +} +\note{ +By default, this will only return the \code{head} of a big.matrix +to prevent console overflow. If you turn off the bigmemory.print.warning +option then it will convert to a base R matrix and print all elements. +} + diff --git a/man/sub.big.matrix.Rd b/man/sub.big.matrix.Rd index 7942efb..126c37c 100644 --- a/man/sub.big.matrix.Rd +++ b/man/sub.big.matrix.Rd @@ -1,14 +1,22 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bigmemory.R +\docType{methods} \name{is.sub.big.matrix} \alias{is.sub.big.matrix} +\alias{is.sub.big.matrix,big.matrix-method} \alias{sub.big.matrix} +\alias{sub.big.matrix,big.matrix-method} \title{Submatrix support} \usage{ is.sub.big.matrix(x) +\S4method{is.sub.big.matrix}{big.matrix}(x) + sub.big.matrix(x, firstRow = 1, lastRow = NULL, firstCol = 1, lastCol = NULL, backingpath = NULL) + +\S4method{sub.big.matrix}{big.matrix}(x, firstRow = 1, lastRow = NULL, + firstCol = 1, lastCol = NULL, backingpath = NULL) } \arguments{ \item{x}{either a \code{\link{big.matrix}} or a descriptor.} @@ -42,6 +50,7 @@ object that references a contiguous set of columns and rows of another otherwise. } \examples{ +\dontrun{ x <- big.matrix(10, 5, init=0, type="double") x[,] <- 1:50 y <- sub.big.matrix(x, 2, 9, 2, 3) @@ -49,6 +58,7 @@ y[,] y[1,1] <- -99 x[,] } +} \author{ John W. Emerson and Michael J.
Kane } diff --git a/man/typeof-big.matrix-method.Rd b/man/typeof-big.matrix-method.Rd new file mode 100644 index 0000000..d6c3a54 --- /dev/null +++ b/man/typeof-big.matrix-method.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/bigmemory.R +\docType{methods} +\name{typeof,big.matrix-method} +\alias{typeof,big.matrix-method} +\title{The Type of a big.matrix Object} +\usage{ +\S4method{typeof}{big.matrix}(x) +} +\arguments{ +\item{x}{A \code{big.matrix} object} +} +\description{ +\code{typeof} returns the storage type of a +\code{big.matrix} object +} + diff --git a/man/write.big.matrix.Rd b/man/write.big.matrix.Rd index 3b86d78..2584c8f 100644 --- a/man/write.big.matrix.Rd +++ b/man/write.big.matrix.Rd @@ -1,18 +1,30 @@ % Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/bigmemory.R +\docType{methods} \name{write.big.matrix} \alias{read.big.matrix} +\alias{read.big.matrix,character-method} \alias{write.big.matrix} +\alias{write.big.matrix,big.matrix,character-method} \title{File interface for a ``big.matrix''} \usage{ write.big.matrix(x, filename, row.names = FALSE, col.names = FALSE, sep = ",") +\S4method{write.big.matrix}{big.matrix,character}(x, filename, + row.names = FALSE, col.names = FALSE, sep = ",") + read.big.matrix(filename, sep = ",", header = FALSE, col.names = NULL, row.names = NULL, has.row.names = FALSE, ignore.row.names = FALSE, type = NA, skip = 0, separated = FALSE, backingfile = NULL, backingpath = NULL, descriptorfile = NULL, binarydescriptor = FALSE, extraCols = NULL, shared = TRUE) + +\S4method{read.big.matrix}{character}(filename, sep = ",", header = FALSE, + col.names = NULL, row.names = NULL, has.row.names = FALSE, + ignore.row.names = FALSE, type = NA, skip = 0, separated = FALSE, + backingfile = NULL, backingpath = NULL, descriptorfile = NULL, + binarydescriptor = FALSE, extraCols = NULL, shared = TRUE) } \arguments{ \item{x}{a 
\code{\link{big.matrix}}.} @@ -96,7 +108,9 @@ Or perhaps to specify columns targeted for factor or character conversion to numeric values. Would you use such features? Email us and let us know! } \examples{ +\dontrun{ # Without specifying the type, this big.matrix x will hold integers. + x <- as.big.matrix(matrix(1:10, 5, 2)) x[2,2] <- NA x[,] @@ -130,6 +144,7 @@ head(y) # The following would fail with a dimension mismatch: if (FALSE) y <- read.big.matrix("IrisData.txt", header=TRUE) } +} \author{ John W. Emerson and Michael J. Kane \email{} diff --git a/src/bigmemory.cpp b/src/bigmemory.cpp index 65d87bf..8304f54 100644 --- a/src/bigmemory.cpp +++ b/src/bigmemory.cpp @@ -1768,6 +1768,9 @@ Rcpp::String GetTypeString( SEXP bigMatAddr ) * quick function to access big.matrix sizes * possibly convert in to a method for object.size??? */ +//' @title big.matrix size +//' @description Returns the size of the created matrix in bytes +//' @param bigMat a \code{big.matrix} object //' @export // [[Rcpp::export]] SEXP GetMatrixSize( SEXP bigMat ) diff --git a/tests/testthat/test_create.R b/tests/testthat/test_create.R index 7657aa1..596d88c 100644 --- a/tests/testthat/test_create.R +++ b/tests/testthat/test_create.R @@ -39,4 +39,9 @@ test_that("attach methods successful",{ x <- attach.big.matrix(bmdescription) expect_false(identical(z@address, y@address)) expect_identical(bm[,], x[,]) -}) \ No newline at end of file +}) + +rm(z) +gc() +file.remove('example.bin') +file.remove('example.desc') diff --git a/tests/testthat/test_float_type.R b/tests/testthat/test_float_type.R index ac75a1c..b9985c9 100644 --- a/tests/testthat/test_float_type.R +++ b/tests/testthat/test_float_type.R @@ -54,3 +54,8 @@ test_that("Proper warning returned", { float type downcast") }) +rm(z) +gc() +file.remove('example.bin') +file.remove('example.desc') + diff --git a/tests/testthat/test_matrix_manips.R b/tests/testthat/test_matrix_manips.R index 97edcc1..3738905 100644 --- 
a/tests/testthat/test_matrix_manips.R +++ b/tests/testthat/test_matrix_manips.R @@ -46,3 +46,8 @@ test_that("flush works correctly",{ expect_warning(flush(bm), info="You cannot call flush on a non-filebacked big.matrix") }) + +rm(z) +gc() +file.remove('example.bin') +file.remove('example.desc') diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 65f74a3..f9124bd 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -37,4 +37,9 @@ test_that("dimnames returned are correct", { expect_identical(dimnames(mat), dimnames(z), info = "dimnames don't match between filebacked.big.matrix and matrix") -}) \ No newline at end of file +}) + +rm(z) +gc() +file.remove('example.bin') +file.remove('example.desc') diff --git a/tests/testthat/test_read.R b/tests/testthat/test_read.R index efca2ad..77b8a7c 100644 --- a/tests/testthat/test_read.R +++ b/tests/testthat/test_read.R @@ -58,4 +58,4 @@ test_that("test_read", { expect_identical(bmnull[, ], matnull[, ], info = "full matrix without names") } return(TRUE) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test_readonly.R b/tests/testthat/test_readonly.R index 7367757..d0755b2 100644 --- a/tests/testthat/test_readonly.R +++ b/tests/testthat/test_readonly.R @@ -9,8 +9,8 @@ fbm.file = "fbm" fbm.desc.file = "fbm.desc" fbm.desc.path = file.path(back.dir,fbm.desc.file) fbm.data.path = file.path(back.dir,fbm.file) -fbm = filebacked.big.matrix(3,3,dimnames=list(rownames,colnames),backingpath=back.dir, backingfile=fbm.file, descriptorfile=paste(fbm.file,".desc",sep="")) -fbm[,] = 1:9 +# fbm = filebacked.big.matrix(3,3,dimnames=list(rownames,colnames),backingpath=back.dir, backingfile=fbm.file, descriptorfile=paste(fbm.file,".desc",sep="")) +# fbm[,] = 1:9 bm = big.matrix(3,3,dimnames=list(rownames,colnames)) @@ -39,6 +39,13 @@ test_that("test_readonly", { expect_error({ bm2[matrix(c(1, 2, 2, 2), ncol = 2), ] = 100 }, info = "Writing subset by matrix to a big.matrix made read-only by 
FS before attached gives error") + + # in order to reuse, must remove prior objects +# rm(fbm) +# gc() +# file.remove(file.path(back.dir, fbm.file)) +# file.remove(file.path(back.dir, fbm.desc.file)) + fbm = filebacked.big.matrix(3, 3, dimnames = list(rownames, colnames), backingpath = back.dir, backingfile = fbm.file, descriptorfile = fbm.desc.file) @@ -83,4 +90,11 @@ test_that("test_readonly", { fbm3[1, 1] = 100 }, info = "Should give error if you ask for a readonly matrix and try to write to it.") return(TRUE) -}) \ No newline at end of file + + rm(fbm, fbm2, fbm3) + gc() + file.remove(file.path(back.dir, fbm.file)) + file.remove(file.path(back.dir, fbm.desc.file)) +}) + + diff --git a/vignettes/Overview.Rnw b/vignettes/Overview.Rnw new file mode 100644 index 0000000..39ce880 --- /dev/null +++ b/vignettes/Overview.Rnw @@ -0,0 +1,409 @@ +% \VignetteIndexEntry{The Bigmemory Project Overview} +% \VignetteDepends{bigmemory} +% \VignettePackage{bigmemory} +% \VignetteEngine{knitr::knitr} +\documentclass[12pt]{article} + +\usepackage{graphics} +\usepackage{graphicx} + +\usepackage{accents} + +% New from euler: +\usepackage{ae} +\usepackage{color} +\usepackage{url} + +\topmargin=-0.85in +\textheight=9.5in +\textwidth=6.5in +\oddsidemargin=0in +%-0.25in + +%\usepackage{CJK} +%\usepackage{pinyin} +\def\E{\mathord{I\kern-.35em E}} +\def\R{\mathord{I\kern-.35em R}} +\def\P{\mathord{I\kern-.35em P}} +\def\I{\mathord{1\kern-.35em 1}} +\def\wt{\mathord{\widehat{\theta}}} + +\newcommand{\proglang}[1]{\textbf{#1}} +\newcommand{\pkg}[1]{\texttt{\textsl{#1}}} +\newcommand{\code}[1]{\texttt{#1}} +\newcommand{\mg}[1]{{\textcolor {magenta} {#1}}} +\newcommand{\gr}[1]{{\textcolor {green} {#1}}} +\newcommand{\bl}[1]{{\textcolor {blue} {#1}}} + +\newtheorem{thm}{Theorem}[section] +\newtheorem{myexplore}[thm]{Explore} +\newtheorem{mybackground}[thm]{Background} +\newtheorem{myquestion}[thm]{Question} +\newtheorem{myexample}[thm]{Example} +\newtheorem{mydefinition}[thm]{Definition} 
+\newtheorem{mytheorem}[thm]{Theorem} + +%\pagestyle{myheadings} % Go for customized headings +%\markboth{notused left title}{John W. Emerson, Department of Statistics, Yale University \copyright 2009} +%\newcommand{\sekshun}[1] % In 'article' only the page +% { % number appears in the header. +% \section{#1} % I want the section name AND +% \markboth{#1 \hfill}{#1 \hfill} % the page, so I need a new kind +% } % of '\sekshun' command. + +\begin{document} + +\setkeys{Gin}{width=1.0\textwidth} + +<>= +library(knitr) +opts_chunk$set( +fig.path='graphics/stat' +) +@ + + +<>= +options(keep.source = TRUE, width = 75) +@ + +\begin{center} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +{\Large\bf The Bigmemory Project} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\vspace*{0.5cm} +{\bf Michael J. Kane and John W. Emerson\\ +Yale University\\ +April 29, 2010} + +\vspace*{0.25cm} + +\end{center} + +%\begin{raggedright} +\parindent=0.5in + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\begin{quotation} +Multi-gigabyte data sets often challenge and frustrate \proglang{R} +users. \proglang{C/C++} programming can provide efficiencies, +but is cumbersome for interactive data analysis and +lacks the flexibility and power of \proglang{R}'s rich statistical +programming environment. The package \pkg{bigmemory} and sister +packages \pkg{biganalytics}, \pkg{synchronicity}, \pkg{bigtabulate}, +and \pkg{bigalgebra} bridge this gap, implementing massive matrices +and supporting their manipulation and exploration. +The data structures may be allocated to shared memory, allowing separate +processes on the same computer to share access to a single copy of the +data set. The data structures may also be file-backed, allowing users +to easily manage and analyze data sets larger than available RAM and +share them across nodes of a cluster. 
+These features of the Bigmemory Project open the door for powerful and +memory-efficient parallel analyses and data mining of massive data sets, +even on modest hardware. +\end{quotation} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\vspace*{0.5cm} +\noindent +{\bf Introductory Example: the 2009 JSM Data Expo} +\vspace*{0.5cm} + +Consider the complete airline on-time performance data +from the 2009 JSM Data Expo. The processed data set, \texttt{airline.csv}, +is approximately 11 GB (about 120 million rows and 29 columns) +with factors coded as integers (see \url{http://www.bigmemory.org/} for +processing information). +The \texttt{read.big.matrix()} call creates the binary +file-backing \texttt{airline.bin} +associated with the \texttt{big.matrix} object \texttt{x}. +Subsequent \proglang{R} sessions can attach instantly to \texttt{airline.bin} +without incurring the one-time overhead (about 25 minutes) +associated with creating the backing. +A summary of the entire data set is easily obtained using the new +\texttt{summary()} method. Note the surprising +presence of negative arrival and departure delays: exploratory data analysis +in action via \pkg{bigmemory}. The summary +only takes 3-4 minutes to process the 11 GB of data on a laptop with only +4 GB of RAM. +<>= +library(bigmemory) +library(biganalytics) +x <- read.big.matrix("airline.csv", type="integer", header=TRUE, + backingfile="airline.bin", + descriptorfile="airline.desc", + extraCols="Age") +summary(x) + +# min max mean NA's +#Year 1987 2008 1998.62 0 +#Month 1 12 6.55 0 +#DayofMonth 1 31 15.72 0 +#DayOfWeek 1 7 3.94 0 +#ArrDelay -1437 2598 7.05 2587529 +#DepDelay -1410 2601 8.17 2302136 +#... (other variables omitted here) ... +@ + + +\noindent +{\bf Overview} +\vspace*{0.5cm} + +Data frames and matrices in \proglang{R} are easy to use, +with typical manipulations executing quickly on +data sets much smaller than available RAM. 
They suit the needs of many +\proglang{R} users and work seamlessly with existing \proglang{R} functions +and packages. However, problems arise with larger data sets and when +increased memory requirements of parallel programming strain the system. + +The Bigmemory Project offers packages +for two purposes. First, \pkg{bigmemory}, \pkg{biganalytics}, and +\pkg{bigtabulate} have been designed to provide a minimalist, +elegant framework for users to manage and explore large data sets, even +on modest hardware (expensive workstations or clusters are not required). +The interface is designed to mimic \proglang{R}'s familiar \code{matrix} +syntax. Matthew Keller, Assistant Professor of +Psychology, University of Colorado at Boulder offered the following +testimonial about \pkg{bigmemory}: ``I love that it's intuitive and +doesn't require a lot of learning new ways to code things.'' + +Second, the packages of the Bigmemory Project provide a foundation for +memory-efficient parallel programming and can serve as building blocks +for developers of new high-performance computing tools in \proglang{R}. +When used in conjunction with a parallel package (such as \pkg{foreach}, +\pkg{snow}, \pkg{Rmpi}, or \pkg{multicore}, for example), +even shared-memory parallel-computing becomes +accessible to non-experts. +The programming interface is stable, and offers the flexibility to support +the development of +algorithms working seamlessly on both \texttt{big.matrix} and traditional +\texttt{matrix} objects. For examples of this, look first +at the function \texttt{mwhich()}; it offers flexible \texttt{which()}-like +functionality that is computationally efficient and avoids memory overhead. +In addition, all the functions provided by \pkg{bigtabulate} may be used +with \texttt{matrix} and \texttt{big.matrix} objects alike. 
+ +\vspace*{0.5cm} +\noindent +{\bf Underneath the Hood of the Bigmemory Project} +\vspace*{0.5cm} + +The packages of the Bigmemory Project use the Boost Interprocess +\proglang{C++} library to provide platform-independent support for +massive matrices that may be shared across \proglang{R} processes. +Innovative use of \proglang{C++} accessors supports matrices of +\texttt{double}, \texttt{integer}, \texttt{short}, and \texttt{char}, +as well as the development of algorithms working seamlessly on +\texttt{big.matrix} objects or traditional \proglang{R} matrices. + +\vspace*{0.5cm} +\noindent +{\bf Example: Airplane Ages and Parallel Processing} +\vspace*{0.5cm} + +We would like to approximate the age of each plane at the time of +each flight. This first requires calculation of an approximate +``birthmonth'' for each plane: the month of the first +appearance in the data set. Given a matrix +\texttt{y} containing \texttt{Year} and \code{Month} for all flights +of a given plane, \texttt{birthmonth(y)} returns the +month (in months AD) of the earliest flight: +<>= +birthmonth <- function(y) { + minYear <- min(y[,'Year'], na.rm=TRUE) + these <- which(y[,'Year']==minYear) + minMonth <- min(y[these,'Month'], na.rm=TRUE) + return(12*minYear + minMonth - 1) +} +@ +A traditional approach to calculating all the birthmonths might use a \code{for()} loop: +<>= +allplanes <- unique(x[,'TailNum']) +planeStart <- rep(0, length(allplanes)) +for (i in allplanes) { + planeStart[i] <- birthmonth( x[mwhich(x, 'TailNum', i, 'eq'), + c('Year', 'Month'), drop=FALSE] ) +} +@ +With about 13,000 flights this takes about 9 hours, even with the relatively +fast and memory-efficient use of \texttt{mwhich()}.
+ +A far more efficient alternative is to first obtain a list of row indices +for each plane: +<>= +library(bigtabulate) +planeindices <- bigsplit(x, 'TailNum') +@ +Here, the use of the new function \code{bigsplit()} is equivalent to +<>= +planeindices <- split(1:nrow(x), x[,'TailNum']) +@ +but is faster (16 versus 29 seconds) and more memory efficient (with +peak memory usage of 2 versus 3 GB). +Either way, +\texttt{planeindices[i]} contains all row indices corresponding to flights +with \texttt{TailNum} equal to \texttt{i}. This requires several hundred MB, +but is computationally more efficient in this problem. For example, +\texttt{planeindices} may be used with \code{sapply()} in the obvious way, +completing the task in a mere 30 seconds: +<>= +planeStart <- sapply(planeindices, + function(i) birthmonth(x[i, c('Year','Month'), + drop=FALSE])) +@ + +The looping structure \texttt{foreach()} of package \pkg{foreach} +can be a powerful and flexible alternative to \texttt{for()} or +functions like +\texttt{lapply()} and \texttt{sapply()}. It can also +take advantage of the shared-memory +capability of \pkg{bigmemory}. Package \pkg{doMC} provides one of several +available ``parallel backends'' for the function \texttt{foreach()}, allowing +the work to be automatically distributed to available processor cores: +<>= +library(doMC) +registerDoMC(cores=2) +planeStart <- foreach(i=planeindices, .combine=c) %dopar% { + return(birthmonth(x[i, c('Year','Month'), drop=FALSE])) +} +@ +The syntax of a \code{foreach()} loop is slightly different from the +syntax of a traditional loop, but its benefits are clear: +in this example, it takes only 14 seconds +to calculate the plane birthmonths using two processor cores.\footnote{We +should note that \pkg{doMC} and \pkg{multicore} are particularly well-suited +for this. 
When other parallel backends are used, one additional command is +required in the \code{birthmonth()} function: \code{x <- attach.big.matrix(xdesc)} +where \code{xdesc <- describe(x)} would be required just prior to the +\code{foreach()} loop, providing explicit shared-memory access across processes. +In contrast, \pkg{multicore} automatically operates on shared memory, +avoiding the need for this extra step.} +Both cores share access to the same master copy +of the airline data (with \texttt{Year} and \texttt{Month} cached in RAM); +individual calls to \texttt{birthmonth()} are relatively small in size. +Without the \texttt{registerDoMC()} +initialization, the \code{foreach()} loop would run on a single processor core, much +like \code{sapply()}, but taking about 24 seconds in this problem +with lower memory overhead than \code{sapply()}. + +Finally, the plane ages at the time of all flights may be calculated: +<>= +x[,'Age'] <- x[,'Year']*as.integer(12) + + x[,'Month'] - as.integer(planeStart[x[,'TailNum']]) +@ +This arithmetic is conducted on \proglang{R} vectors extracted from +the \code{big.matrix}; use of +\code{as.integer()} helps keep the memory consumption under control. + +\vspace*{0.5cm} +\noindent +{\bf Concluding Example: a Big Regression} +\vspace*{0.5cm} + +In addition to providing basic functions for exploratory data analysis, the +package \pkg{biganalytics} provides a wrapper for Thomas Lumley's +\pkg{biglm} package, supporting massive +linear and generalized linear models.\footnote{Package \pkg{biganalytics} +also provides \code{bigkmeans()}, and other analytics may be added to the +package in the future.} The following toy example examines +the airline arrival delays as a linear function of the age of the plane +at the time of the flight and the year of the flight. About 85 million +flights are used (because of missing airplane tailcodes). 
+We estimate that use of \proglang{R}'s \texttt{lm()} +function would require more than 10 GB of memory overhead, while +this example runs in about 3 minutes with only several hundred MB of memory +overhead. +<>= +blm <- biglm.big.matrix(ArrDelay ~ Age + Year, data=x) +summary(blm) +@ +<>= +#Large data regression model: biglm(formula = formula, data = data, ...) +#Sample size = 84216580 +# Coef (95% CI) SE p +#(Intercept) 91.6149 87.6509 95.5789 1.9820 0 +#Age 0.0144 0.0142 0.0146 0.0001 0 +#Year -0.0424 -0.0444 -0.0404 0.0010 0 +@ +From this, we might conclude that older planes are associated with increased predicted +delays, and predicted delays in recent years are lower. However, this +exercise is merely for illustrative purposes; a serious study of airline delays would +quickly reject this oversimplification and discover problems with this particular +regression. + +\vspace*{0.5cm} +\noindent +{\bf Additional Information and Supporting Material} +\vspace*{0.5cm} + +These examples were tested both in Linux 64-bit and Windows 7 Enterprise 64-bit +environments. +Older versions of Windows operating systems (including Vista 64-bit) seem +to suffer from extremely inefficient caching behavior with filebackings and +are not recommended for use with +\pkg{bigmemory}; 32-bit environments will be limited by approximately 2 GB +of addressable memory. + +The packages are available via R-Forge and on CRAN as of +late April, 2010; please see +\url{http://www.bigmemory.org/} for more information. +There is a short vignette available in the Documentation area, +as well as presentation slides introducing \pkg{bigmemory} +and providing some benchmarks and shared-memory parallel programming +examples. Please do not use the older version of \pkg{bigmemory} +archived on CRAN (versions $\leq$ 3.12). 
+ +\newpage + +\noindent +{\bf Citations} +\vspace*{0.5cm} + +\begin{enumerate} +\item The Bigmemory Project, \url{http://www.bigmemory.org/}, the home of \proglang{R} packages +\pkg{bigmemory}, \pkg{biganalytics}, \pkg{bigtabulate}, \pkg{bigalgebra}, and +\pkg{synchronicity}. Packages available from CRAN or R-Forge. + +\item 2009 JSM Data Expo: Airline on-time performance. \url {http://stat-computing.org/dataexpo/2009/}. + +\item Thomas Lumley (2009). \pkg{biglm}: bounded memory linear and generalized + linear models. \proglang{R} package version 0.7, + \url{http://CRAN.R-project.org/package=biglm}. + +\item \proglang{R} Development Core Team (2009). \proglang{R}: A language and environment for + statistical computing. \proglang{R} Foundation for Statistical Computing, + Vienna, Austria. ISBN 3-900051-07-0, \url{http://www.R-project.org}. + +\item Luke Tierney, A. J. Rossini, Na Li and H. Sevcikova (). \pkg{snow}: Simple + Network of Workstations. \proglang{R} package version 0.3-3, + \url{http://CRAN.R-project.org/package=snow}. + +\item Simon Urbanek (2009). \pkg{multicore}: Parallel processing of \proglang{R} code on + machines with multiple cores or CPUs. \proglang{R} package version 0.1-3, + \url{http://www.rforge.net/multicore/}. + +\item Stephen Weston and REvolution Computing (2009). \pkg{doMC}: Foreach parallel adaptor for the + \pkg{multicore} package. \proglang{R} package version 1.2.0, + \url{http://CRAN.R-project.org/package=doMC}. + +\item Stephen Weston and REvolution Computing (2009). \pkg{foreach}: Foreach looping +construct for \proglang{R}. \proglang{R} package version 1.3.0, +\url{http://CRAN.R-project.org/package=foreach}. + +\item Hao Yu (2010). \pkg{Rmpi}: Interface (Wrapper) to MPI (Message-Passing Interface). + \proglang{R} package version 0.5-8, \url{http://www.stats.uwo.ca/faculty/yu/Rmpi}. +\end{enumerate} + + +%\end{raggedright} + +\end{document}