diff --git a/NAMESPACE b/NAMESPACE index 86e482e3..7c2a7e56 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(addCorFlex) export(addCorGen) export(addMultiFac) export(addPeriods) +export(betaGetShapes) export(catProbs) export(defCondition) export(defData) diff --git a/R/betaGetShapes.R b/R/betaGetShapes.R new file mode 100644 index 00000000..4b4b8af4 --- /dev/null +++ b/R/betaGetShapes.R @@ -0,0 +1,34 @@ +#' Convert beta mean and dispersion parameters to two shape parameters +#' +#' @param mean The mean of a beta distribution +#' @param dispersion The dispersion parameter of a beta distribution +#' @return A list that includes the shape parameters of the beta distribution +#' @details In simstudy, users specify the beta distribution as a function of two parameters - +#' a mean and dispersion, where 0 < mean < 1 and dispersion > 0. In this case, the variance +#' of the specified distribution is (mean)*(1-mean)/(1+dispersion). The base R function rbeta +#' uses the two shape parameters to specify the beta distribution. This function converts +#' the mean and dispersion into the shape1 and shape2 parameters. +#' @examples +#' set.seed(12345) +#' mean = 0.3; dispersion = 1.6 +#' rs <- betaGetShapes(mean, dispersion) +#' c(rs$shape1, rs$shape2) +#' vec <- rbeta(1000, shape1 = rs$shape1, shape2 = rs$shape2) +#' (estMoments <- c(mean(vec), var(vec))) +#' (theoryMoments <- c(mean, mean*(1-mean)/(1+dispersion))) +#' (theoryMoments <- with(rs, c(shape1/(shape1 + shape2), +#' (shape1*shape2) / ((shape1 + shape2)^2*(1 + shape1 + shape2))))) +#' @export + +betaGetShapes <- function(mean, dispersion) { + + if (any(mean <= 0) | any(mean >= 1)) stop("Mean out of range: must be between 0 and 1") + if (any(dispersion <= 0)) stop("Mean out of range: must be greater than 0") + + a <- mean * dispersion + b <- (1 - mean) * dispersion + + + return(list(shape1 = a, shape2 = b)) + +} diff --git a/R/int_evalDef.R b/R/int_evalDef.R index 566a77fb..c1768743 100644 --- a/R/int_evalDef.R +++ b/R/int_evalDef.R @@ -26,6 +26,7 @@ evalDef <- function(newvar, newform, newdist, defVars) { if (newdist %in% c("uniform","uniformInt") & nparam != 2) { stop("Uniform (continuous & integer) requires min and max", call. = FALSE) } + if (newdist == "categorical" & nparam < 2) { stop("Categorical distribution requires 2 or more probabilities", call. = FALSE) @@ -71,7 +72,7 @@ evalDef <- function(newvar, newform, newdist, defVars) { # Make sure that distribution is allowed if (!(newdist %in% c("normal","binary", "binomial","poisson","noZeroPoisson", - "uniform","categorical","gamma","nonrandom", + "uniform","categorical","gamma","beta","nonrandom", "uniformInt", "negBinomial", "exponential" ))) { diff --git a/R/int_genbeta.R b/R/int_genbeta.R new file mode 100644 index 00000000..93b2e842 --- /dev/null +++ b/R/int_genbeta.R @@ -0,0 +1,36 @@ +#### Beta distribution #### + +# Internal function called by generate - returns exp data +# +# @param n The number of observations required in the data set +# @param formula String that specifies the mean (lambda) +# @return A data.frame column with the updated simulated data + +getBetaMean <- function(dtSim, formula, link ) { + + if (link == "logit") { + + logit <- with(dtSim, eval(parse(text = as.character(formula)))) + mean <- 1 / (1 + exp(-logit)) + + } else { + + mean <- with(dtSim,eval(parse(text = as.character(formula)))) + } + + return(mean) + +} + +genbeta <- function(n, formula, dispersion, link="identity", dtSim) { + + mean <- getBetaMean(dtSim, formula, link) + + d <- as.numeric(as.character(dispersion)) + + sr <- betaGetShapes(mean = mean, dispersion = d) + new <- stats::rbeta(n, shape = sr$shape1, shape2 = sr$shape2) + + return(new) + +} diff --git a/R/int_generate.R b/R/int_generate.R index d89c92f6..cb233c59 100644 --- a/R/int_generate.R +++ b/R/int_generate.R @@ -13,7 +13,7 @@ generate <- function(args, n, dfSim, idname) { if (! args$dist %in% c("normal", "poisson", "noZeroPoisson", "binary", "binomial","uniform", "uniformInt", - "categorical", "gamma", "negBinomial", + "categorical", "gamma", "beta", "negBinomial", "nonrandom", "exponential")) { stop(paste(args$dist, "not a valid distribution. Please check spelling."), call. = FALSE) } @@ -39,6 +39,8 @@ generate <- function(args, n, dfSim, idname) { newColumn <- genexp(n, args$formula, args$link, dfSim) } else if (args$dist == "gamma") { newColumn <- gengamma(n, args$formula, args$variance, args$link, dfSim) + } else if (args$dist == "beta") { + newColumn <- genbeta(n, args$formula, args$variance, args$link, dfSim) } else if (args$dist == "negBinomial") { newColumn <- gennegbinom(n, args$formula, args$variance, args$link, dfSim) } else if (args$dist == "nonrandom") { diff --git a/man/betaGetShapes.Rd b/man/betaGetShapes.Rd new file mode 100644 index 00000000..749e5e33 --- /dev/null +++ b/man/betaGetShapes.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/betaGetShapes.R +\name{betaGetShapes} +\alias{betaGetShapes} +\title{Convert beta mean and dispersion parameters to two shape parameters} +\usage{ +betaGetShapes(mean, dispersion) +} +\arguments{ +\item{mean}{The mean of a beta distribution} + +\item{dispersion}{The dispersion parameter of a beta distribution} +} +\value{ +A list that includes the shape parameters of the beta distribution +} +\description{ +Convert beta mean and dispersion parameters to two shape parameters +} +\details{ +In simstudy, users specify the beta distribution as a function of two parameters - +a mean and dispersion, where 0 < mean < 1 and dispersion > 0. In this case, the variance +of the specified distribution is (mean)*(1-mean)/(1+dispersion). The base R function rbeta +uses the two shape parameters to specify the beta distribution. This function converts +the mean and dispersion into the shape1 and shape2 parameters. +} +\examples{ +set.seed(12345) +mean = 0.3; dispersion = 1.6 +rs <- betaGetShapes(mean, dispersion) +c(rs$shape1, rs$shape2) +vec <- rbeta(1000, shape1 = rs$shape1, shape2 = rs$shape2) +(estMoments <- c(mean(vec), var(vec))) +(theoryMoments <- c(mean, mean*(1-mean)/(1+dispersion))) +(theoryMoments <- with(rs, c(shape1/(shape1 + shape2), + (shape1*shape2) / ((shape1 + shape2)^2*(1 + shape1 + shape2))))) +} diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd index 5ba2c51e..ca498e3c 100644 --- a/vignettes/simstudy.Rmd +++ b/vignettes/simstudy.Rmd @@ -295,6 +295,7 @@ def <- defData(def,varname="y2", dist="poisson", formula="nr - 0.2 * x1",link="l def <- defData(def, varname = "xnb", dist = "negBinomial" , formula="nr - 0.2 * x1", variance = 0.05, link = "log") def <- defData(def,varname="xCat",formula = "0.3;0.2;0.5", dist="categorical") def <- defData(def,varname="g1", dist="gamma", formula = "5+xCat", variance = 1, link = "log") +def <- defData(def,varname="b1", dist="beta", formula = "1+0.3*xCat", variance = 1, link = "logit") def <- defData(def, varname = "a1", dist = "binary" , formula="-3 + xCat", link="logit") def <- defData(def, varname = "a2", dist = "binomial" , formula="-3 + xCat", variance = 100, link="logit") @@ -311,15 +312,16 @@ def <- defData(def,varname="y2",dist="poisson",formula="nr - 0.2 * x1",link="log def <- defData(def, varname = "xnb", dist = "negBinomial" , formula="nr - 0.2 * x1", variance = 0.05, link = "log") def <- defData(def,varname="xCat",formula = "0.3;0.2;0.5",dist="categorical") def <- defData(def,varname="g1", dist="gamma", formula = "5+xCat", variance = 1, link = "log") +def <- defData(def,varname="b1", dist="beta", formula = "1+0.3*xCat", variance = 1, link = "logit") def <- defData(def, varname = "a1", dist = "binary" , formula="-3 + xCat", link="logit") def <- defData(def, varname = "a2", dist = "binomial" , formula="-3 + xCat", variance = 100, link="logit") ``` The first call to `defData` without specifying a definition name (in this example the definition name is *def*) creates a **new** data.table with a single row. An additional row is added to the table `def` each time the function `defData` is called. Each of these calls is the definition of a new field in the data set that will be generated. In this example, the first data field is named 'nr', defined as a constant with a value to be 7. In each call to `defData` the user defines a variable name, a distribution (the default is 'normal'), a mean formula (if applicable), a variance parameter (if applicable), and a link function for the mean (defaults to 'identity').\ -The possible distributions include **normal**, **gamma**, **poisson**, **zero-truncated poisson**, **negative binomial**, **binary**, **binomial**, **uniform**, **uniform integer**, **categorical**, and **deterministic/non-random**. For all of these distributions, key parameters defining the distribution are entered in the `formula`, `variance`, and `link` fields. +The possible distributions include **normal**, **gamma**, **poisson**, **zero-truncated poisson**, **negative binomial**, **binary**, **binomial**, **beta**, **uniform**, **uniform integer**, **categorical**, and **deterministic/non-random**. For all of these distributions, key parameters defining the distribution are entered in the `formula`, `variance`, and `link` fields. -In the case of the **normal**, **gamma**, and **negative binomial** distributions, the formula specifies the mean. The formula can be a scalar value (number) or a string that represents a function of previously defined variables in the data set definition (or, as we will see later, in a previously generated data set). In the example, the mean of `y1`, a normally distributed value, is declared as a linear function of `nr` and `x1`, and the mean of `g1` is a function of the category defined by `xCat`. The `variance` field is defined only for normal and gamma random variables, and can only be defined as a scalar value. In the case of gamma random and negative binomial variables, the value entered in variance field is really a dispersion value $d$. The variance of a gamma distributed variable will be $d \times mean^2$, and for a negative binomial distributed variable, the variance will be $mean + d*mean^2$. \ +In the case of the **normal**, **gamma**, **beta**, and **negative binomial** distributions, the formula specifies the mean. The formula can be a scalar value (number) or a string that represents a function of previously defined variables in the data set definition (or, as we will see later, in a previously generated data set). In the example, the mean of `y1`, a normally distributed value, is declared as a linear function of `nr` and `x1`, and the mean of `g1` is a function of the category defined by `xCat`. The `variance` field is defined only for normal, gamma, beta, and negative binomial random variables, and can only be defined as a scalar value. In the case of gamma, beta, and negative binomial variables, the value entered in variance field is really a dispersion value $d$. The variance of a gamma distributed variable will be $d \times mean^2$, for a beta distributed variable will be $mean \times (1- mean)/(1 + d)$, and for a negative binomial distributed variable, the variance will be $mean + d*mean^2$. \ In the case of the **poisson**, **zero-truncated poisson**, and **binary** distributions, the formula also specifies the mean. The variance is not a valid parameter in these cases, but the `link` field is. The default link is 'identity' but a 'log' link is available for the Poisson distributions and a "logit" link is available for the binary outcomes. In this example, `y2` is defined as Poisson random variable with a mean that is function of `nr` and `x1` on the log scale. For binary variables, which take a value of 0 or 1, the formula represents probability (with the 'identity' link) or log odds (with the 'logit' link) of the variable having a value of 1. In the example, `a1` has been defined as a binary random variable with a log odds that is a function of `xCat`. \