/
SimulateWeightedTTestClusters.R
81 lines (65 loc) · 3.61 KB
/
SimulateWeightedTTestClusters.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Function to generate 1 dataset with clustering
createData.fnc <- function(ICC = 0.1, # intraclass correlation coefficient
clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8), # cluster sizes
treatment = 0) { # treatment effect (= 0 for simulating null hypothesis)
# Calculate between-cluster variance:
# ICC = var.between / (var.within + var.between)
# here: var.within = 1
# hence: ICC = var.between / (1 + var.between)
# <=> ICC + ICC*var.between = var.between
# <=> var.between - ICC*var.between = ICC
# <=> var.beween*(1-ICC) = ICC <=> var.between = ICC / (1-ICC)
var.between <- (ICC)/(1-ICC)
# Generate data frame with information about clusters
Clusters <- data.frame(cluster = factor(1:length(clusterSizes)), # generate k clusters
effectCluster = rnorm(length(clusterSizes), 0, sqrt(var.between)), # cluster effect (drawn from normal distribution with sd = sqrt(var.between))
conditionCluster = c(rep("control", floor(length(clusterSizes)/2)), # divide clusters into 'control' and 'intervention' clusters
rep("intervention", ceiling(length(clusterSizes)/2)))
)
# Generate data frame with information about participants
Data <- data.frame(cluster = rep(factor(1:length(clusterSizes)), clusterSizes), # each participant belongs to a cluster
effectParticipants = rnorm(sum(clusterSizes), 0, 1) # participant effect (drawn from normal distribution with sd = sqrt(1) (= var.within))
)
# Combine cluster and participant information
Data <- merge(Data, Clusters, by = "cluster")
# Add cluster effect to participant effect and add treatment effect
Data$outcome = Data$effectParticipants + Data$effectCluster + (as.numeric(Data$conditionCluster)-1)*treatment
# Return dataset
return(Data)
}
# Load plyr package (for summarising data)
library(plyr)
# Function for analysing data generated by createData.fnc()
analyseCluster.fnc <- function(dataset) {
# Compute mean and number of observations in each cluster
dat.sum <- ddply(dataset, .(cluster, conditionCluster), summarise,
meanOutcome = mean(outcome),
n = length(outcome))
# Compute p-value for t-test on outcomes (ignoring clustering)
# (anova = t-test here)
ttestIgnore <- anova(lm(outcome ~ conditionCluster, dataset))$'Pr(>F)'[1]
# Compute p-value for t-test on cluster means, weighted for cluster size
ttestWeighted <- anova(lm(meanOutcome ~ conditionCluster, dat.sum, weights = n)))$'Pr(>F)'[1]
# Compute p-value for t-test on cluster means, unweighted
ttestUnweighted <- anova(lm(meanOutcome ~ conditionCluster, dat.sum))$'Pr(>F)'[1]
return(list(ttestIgnore,
ttestWeighted,
ttestUnweighted
)
)
}
# Combine these two function
simulate.fnc <- function(ICC = 0.1,
clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8),
treatment = 0) {
Data <- createData.fnc(ICC = 0.1,
clusterSizes = sample(clusterSizes), # resample (wo replacement) clusterSizes: you don't know beforehand which cluster ends up in which condition
treatment = treatment)
pVals <- analyseCluster.fnc(Data)
return(pVals)
}
# Now run the simulation 10,000 times and see how many p-values are below 0.05 (should be 5%)
pvals <- replicate(1e4, simulate.fnc(clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8)))
mean(unlist(pvals[1,] <= 0.05)) # 44%
mean(unlist(pvals[2,] <= 0.05)) # 9%
mean(unlist(pvals[3,] <= 0.05)) # 5%