-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_data.R
136 lines (130 loc) · 4.67 KB
/
generate_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#' Generate simulated data
#'
#' Generate data for a regular monitoring design.
#' The counts follow a negative binomial distribution with given size parameters
#' and the true mean mu depending on a year, period and site effect.
#' All effects are independent from each other and have, on the log-scale, a
#' normal distribution with zero mean and given standard deviation.
#' @param intercept The global mean on the log-scale.
#' @param n_year The number of years.
#' @param n_period The number of periods.
#' @param n_site The number of sites.
#' @param year_factor Convert year to a factor.
#' Defaults to `FALSE`.
#' @param period_factor Convert period to a factor.
#' Defaults to `FALSE`.
#' @param site_factor Convert site to a factor.
#' Defaults to `FALSE`.
#' @param trend The long-term linear trend on the log-scale.
#' @param sd_rw_year The standard deviation of the year effects on the
#' log-scale.
#' @param amplitude_period The amplitude of the periodic effect on the
#' log-scale.
#' @param mean_phase_period The mean of the phase of the periodic effect among
#' years.
#' Defaults to `0`.
#' @param sd_phase_period The standard deviation of the phase of the periodic
#' effect among years.
#' @param sd_site The standard deviation of the site effects on the log-scale.
#' @param sd_rw_site The standard deviation of the random walk along year per
#' site on the log-scale.
#' @param sd_noise The standard deviation of the noise effects on the log-scale.
#' @param size The size parameter of the negative binomial distribution.
#' @param n_run The number of runs with the same mu.
#' @param as_list Return the dataset as a list rather than a data.frame.
#' Defaults to `FALSE`.
#' @param details Add variables containing the year, period and site effects.
#' Defaults tot `FALSE`.
#' @export
#' @return A `data.frame` with five variables.
#' `Year`, `Month` and `Site` are factors identifying the location and time of
#' monitoring.
#' `Mu` is the true mean of the negative binomial distribution in the original
#' scale.
#' `Count` are the simulated counts.
#' @importFrom stats rnorm rnbinom
#' @importFrom dplyr group_by group_map mutate select
#' @importFrom rlang .data !!!
generate_data <- function(
intercept = 2, n_year = 24, n_period = 6, n_site = 20, year_factor = FALSE,
period_factor = FALSE, site_factor = FALSE, trend = 0.01, sd_rw_year = 0.1,
amplitude_period = 1, mean_phase_period = 0, sd_phase_period = 0.2,
sd_site = 1, sd_rw_site = 0.02, sd_noise = 0.01, size = 2, n_run = 10,
as_list = FALSE, details = FALSE
) {
#generate the design
dataset <- expand.grid(
Year = seq_len(n_year), Period = seq_len(n_period), Site = seq_len(n_site)
)
year_rw_effect <- cumsum(rnorm(n_year, mean = 0, sd = sd_rw_year))
phase <- rnorm(n_year + 1, mean = mean_phase_period, sd = sd_phase_period)
site_rw_effect <- rbind(
rnorm(n_site, mean = 0, sd = sd_site),
matrix(
rnorm((n_year - 1) * n_site, mean = 0, sd = sd_rw_site),
ncol = n_site
)
)
site_rw_effect <- as.vector(apply(site_rw_effect, 2, cumsum))
observation_effect <- rnorm(nrow(dataset), mean = 0, sd = sd_noise)
#generate the true mean
dataset$Mu <- exp(
intercept +
year_rw_effect[dataset$Year] + trend * dataset$Year +
amplitude_period *
sin(
dataset$Period * pi / n_period + phase[dataset$Year]
) +
site_rw_effect[dataset$Year + (dataset$Site - 1) * n_year] +
observation_effect
)
if (details) {
dataset$YearEffect <- exp(
year_rw_effect[dataset$Year] + trend * dataset$Year
)
dataset$PeriodEffect <- exp(
amplitude_period *
sin(dataset$Period * pi / n_period + phase[dataset$Year])
)
dataset$SiteEffect <- exp(
site_rw_effect[dataset$Year + (dataset$Site - 1) * n_year]
)
}
if (year_factor) {
dataset$Year <- factor(dataset$Year)
}
if (period_factor) {
dataset$Period <- factor(dataset$Period)
}
if (site_factor) {
dataset$Site <- factor(dataset$Site)
}
# add the runs
dataset <- merge(
dataset,
data.frame(Run = seq_len(n_run))
)
#generate the counts
dataset$Count <- rnbinom(nrow(dataset), size = size, mu = dataset$Mu)
if (!as_list) {
return(dataset)
}
dataset |>
group_by(.data$Run) |>
group_map(~relevant(.x, details = details, run = .y))
}
# internal function for generate_data()
#' @importFrom dplyr mutate select
#' @importFrom rlang !!!
relevant <- function(x, details, run) {
if (details) {
dots <- c("Year", "Period", "Site", "Mu", "YearEffect", "PeriodEffect",
"SiteEffect", "Count")
} else {
dots <- c("Year", "Period", "Site", "Mu", "Count")
}
x |>
select(!!!dots) |>
mutate(Run = run$Run) |>
as.data.frame()
}