Adding beta distribution

kgoldfeld · Aug 27, 2018 · f34ce5b · f34ce5b
1 parent a9839ab
commit f34ce5b
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 4 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(addCorFlex)
 export(addCorGen)
 export(addMultiFac)
 export(addPeriods)
+export(betaGetShapes)
 export(catProbs)
 export(defCondition)
 export(defData)

diff --git a/R/betaGetShapes.R b/R/betaGetShapes.R
@@ -0,0 +1,34 @@
+#' Convert beta mean and dispersion parameters to two shape parameters
+#'
+#' @param mean The mean of a beta distribution
+#' @param dispersion The dispersion parameter of a beta distribution
+#' @return A list that includes the shape parameters of the beta distribution
+#' @details In simstudy, users specify the beta distribution as a function of two parameters - 
+#' a mean and dispersion, where 0 < mean < 1 and dispersion > 0. In this case, the variance 
+#' of the specified distribution is (mean)*(1-mean)/(1+dispersion). The base R function rbeta 
+#' uses the two shape parameters to specify the beta distribution. This function converts 
+#' the mean and dispersion into the shape1 and shape2 parameters.
+#' @examples
+#' set.seed(12345)
+#' mean = 0.3; dispersion = 1.6
+#' rs <- betaGetShapes(mean, dispersion)
+#' c(rs$shape1, rs$shape2)
+#' vec <- rbeta(1000, shape1 = rs$shape1, shape2 = rs$shape2)
+#' (estMoments <- c(mean(vec), var(vec)))
+#' (theoryMoments <- c(mean, mean*(1-mean)/(1+dispersion)))
+#' (theoryMoments <- with(rs, c(shape1/(shape1 + shape2), 
+#'   (shape1*shape2) / ((shape1 + shape2)^2*(1 + shape1 + shape2)))))
+#' @export
+
+betaGetShapes <- function(mean, dispersion) {
+
+  if (any(mean <= 0) | any(mean >= 1)) stop("Mean out of range: must be between 0 and 1")
+  if (any(dispersion <= 0)) stop("Mean out of range: must be greater than 0")
+
+  a <- mean * dispersion
+  b <- (1 - mean) * dispersion
+
+
+  return(list(shape1 = a, shape2 = b))
+
+}
diff --git a/R/int_evalDef.R b/R/int_evalDef.R
@@ -26,6 +26,7 @@ evalDef <- function(newvar, newform, newdist, defVars) {
   if (newdist %in% c("uniform","uniformInt") & nparam != 2) {
       stop("Uniform (continuous & integer) requires min and max", call. = FALSE)
   }
+
 
   if (newdist == "categorical" & nparam < 2) {
     stop("Categorical distribution requires 2 or more probabilities", call. = FALSE)
@@ -71,7 +72,7 @@ evalDef <- function(newvar, newform, newdist, defVars) {
   # Make sure that distribution is allowed
 
   if (!(newdist %in% c("normal","binary", "binomial","poisson","noZeroPoisson",
-                       "uniform","categorical","gamma","nonrandom",
+                       "uniform","categorical","gamma","beta","nonrandom",
                        "uniformInt", "negBinomial", "exponential"
   ))) {
 

diff --git a/R/int_genbeta.R b/R/int_genbeta.R
@@ -0,0 +1,36 @@
+#### Beta distribution ####
+
+# Internal function called by generate - returns exp data
+#
+# @param n The number of observations required in the data set
+# @param formula String that specifies the mean (lambda)
+# @return A data.frame column with the updated simulated data
+
+getBetaMean <- function(dtSim, formula, link ) {
+
+  if (link == "logit") {
+
+    logit <- with(dtSim, eval(parse(text = as.character(formula))))
+    mean <- 1 / (1 + exp(-logit))
+
+  } else {
+
+    mean <- with(dtSim,eval(parse(text = as.character(formula))))
+  }
+
+  return(mean)
+
+}
+
+genbeta <- function(n, formula, dispersion, link="identity", dtSim) {
+
+  mean <- getBetaMean(dtSim, formula, link)
+
+  d <- as.numeric(as.character(dispersion))
+
+  sr <- betaGetShapes(mean = mean, dispersion = d)
+  new <- stats::rbeta(n, shape = sr$shape1, shape2 = sr$shape2)
+
+  return(new)
+
+}
diff --git a/R/int_generate.R b/R/int_generate.R
@@ -13,7 +13,7 @@ generate <- function(args, n, dfSim, idname) {
 
   if (! args$dist %in% c("normal", "poisson", "noZeroPoisson", "binary", 
                          "binomial","uniform", "uniformInt",
-                         "categorical", "gamma", "negBinomial",
+                         "categorical", "gamma", "beta", "negBinomial",
                          "nonrandom", "exponential")) {
     stop(paste(args$dist, "not a valid distribution. Please check spelling."), call. = FALSE)
   }
@@ -39,6 +39,8 @@ generate <- function(args, n, dfSim, idname) {
     newColumn <- genexp(n, args$formula, args$link, dfSim)
   } else if (args$dist == "gamma") {
     newColumn <- gengamma(n, args$formula, args$variance, args$link, dfSim)
+  } else if (args$dist == "beta") {
+    newColumn <- genbeta(n, args$formula, args$variance, args$link, dfSim)
   } else if (args$dist == "negBinomial") {
     newColumn <- gennegbinom(n, args$formula, args$variance, args$link, dfSim)
   } else if (args$dist == "nonrandom") {

diff --git a/man/betaGetShapes.Rd b/man/betaGetShapes.Rd
diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd
@@ -295,6 +295,7 @@ def <- defData(def,varname="y2", dist="poisson", formula="nr - 0.2 * x1",link="l
 def <- defData(def, varname = "xnb", dist = "negBinomial" , formula="nr - 0.2 * x1", variance = 0.05, link = "log")
 def <- defData(def,varname="xCat",formula = "0.3;0.2;0.5", dist="categorical")
 def <- defData(def,varname="g1", dist="gamma", formula = "5+xCat", variance = 1, link = "log")
+def <- defData(def,varname="b1", dist="beta", formula = "1+0.3*xCat", variance = 1, link = "logit")
 def <- defData(def, varname = "a1", dist = "binary" , formula="-3 + xCat", link="logit")
 def <- defData(def, varname = "a2", dist = "binomial" , formula="-3 + xCat", variance = 100, link="logit")
 
@@ -311,15 +312,16 @@ def <- defData(def,varname="y2",dist="poisson",formula="nr - 0.2 * x1",link="log
 def <- defData(def, varname = "xnb", dist = "negBinomial" , formula="nr - 0.2 * x1", variance = 0.05, link = "log")
 def <- defData(def,varname="xCat",formula = "0.3;0.2;0.5",dist="categorical")
 def <- defData(def,varname="g1", dist="gamma", formula = "5+xCat", variance = 1, link = "log")
+def <- defData(def,varname="b1", dist="beta", formula = "1+0.3*xCat", variance = 1, link = "logit")
 def <- defData(def, varname = "a1", dist = "binary" , formula="-3 + xCat", link="logit")
 def <- defData(def, varname = "a2", dist = "binomial" , formula="-3 + xCat", variance = 100, link="logit")
 ```
 
 The first call to `defData` without specifying a definition name (in this example the definition name is *def*) creates a **new** data.table with a single row. An additional row is added to the table `def` each time the function `defData` is called. Each of these calls is the definition of a new field in the data set that will be generated. In this example, the first data field is named 'nr', defined as a constant with a value to be 7. In each call to `defData` the user defines a variable name, a distribution (the default is 'normal'), a mean formula (if applicable), a variance parameter (if applicable), and a link function for the mean (defaults to 'identity').\
 
-The possible distributions include **normal**, **gamma**, **poisson**, **zero-truncated poisson**, **negative binomial**, **binary**, **binomial**, **uniform**, **uniform integer**, **categorical**, and **deterministic/non-random**. For all of these distributions, key parameters defining the distribution are entered in the `formula`, `variance`, and `link` fields. 
+The possible distributions include **normal**, **gamma**, **poisson**, **zero-truncated poisson**, **negative binomial**, **binary**, **binomial**, **beta**, **uniform**, **uniform integer**, **categorical**, and **deterministic/non-random**. For all of these distributions, key parameters defining the distribution are entered in the `formula`, `variance`, and `link` fields. 
 
-In the case of the **normal**, **gamma**, and **negative binomial** distributions, the formula specifies the mean. The formula can be a scalar value (number) or a string that represents a function of previously defined variables in the data set definition (or, as we will see later, in a previously generated data set). In the example, the mean of `y1`, a normally distributed value, is declared as a linear function of `nr` and `x1`, and the mean of `g1` is a function of the category defined by `xCat`. The `variance` field is defined only for normal and gamma random variables, and can only be defined as a scalar value. In the case of gamma random and negative binomial variables, the value entered in variance field is really a dispersion value $d$. The variance of a gamma distributed variable will be $d \times mean^2$, and for a negative binomial distributed variable, the variance will be $mean + d*mean^2$. \
+In the case of the **normal**, **gamma**, **beta**, and **negative binomial** distributions, the formula specifies the mean. The formula can be a scalar value (number) or a string that represents a function of previously defined variables in the data set definition (or, as we will see later, in a previously generated data set). In the example, the mean of `y1`, a normally distributed value, is declared as a linear function of `nr` and `x1`, and the mean of `g1` is a function of the category defined by `xCat`. The `variance` field is defined only for normal, gamma, beta, and negative binomial random variables, and can only be defined as a scalar value. In the case of gamma, beta, and negative binomial variables, the value entered in variance field is really a dispersion value $d$. The variance of a gamma distributed variable will be $d \times mean^2$, for a beta distributed variable will be $mean \times (1- mean)/(1 + d)$, and for a negative binomial distributed variable, the variance will be $mean + d*mean^2$. \
 
 In the case of the **poisson**, **zero-truncated poisson**, and **binary** distributions, the formula also specifies the mean. The variance is not a valid parameter in these cases, but the `link` field is. The default link is 'identity' but a 'log' link is available for the Poisson distributions and a "logit" link is available for the binary outcomes. In this example, `y2` is defined as Poisson random variable with a mean that is function of `nr` and `x1` on the log scale. For binary variables, which take a value of 0 or 1, the formula represents probability (with the 'identity' link) or log odds (with the 'logit' link) of the variable having a value of 1. In the example, `a1` has been defined as a binary random variable with a log odds that is a function of `xCat`.  \