RCode/SimulateWeightedTTestClusters.R

# Function to generate 1 dataset with clustering
createData.fnc <- function(ICC = 0.1, # intraclass correlation coefficient
                           clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8), # cluster sizes
                           treatment = 0) { # treatment effect (= 0 for simulating null hypothesis)
  
  # Calculate between-cluster variance:
  # ICC = var.between / (var.within + var.between)
  # here: var.within = 1
  # hence: ICC = var.between / (1 + var.between) 
  # <=> ICC + ICC*var.between = var.between
  # <=> var.between - ICC*var.between = ICC 
  # <=> var.beween*(1-ICC) = ICC <=> var.between = ICC / (1-ICC)
  var.between <- (ICC)/(1-ICC)

  # Generate data frame with information about clusters
  Clusters <- data.frame(cluster = factor(1:length(clusterSizes)), # generate k clusters
                         effectCluster = rnorm(length(clusterSizes), 0, sqrt(var.between)), # cluster effect (drawn from normal distribution with sd = sqrt(var.between))
                         conditionCluster = c(rep("control", floor(length(clusterSizes)/2)), # divide clusters into 'control' and 'intervention' clusters
                                              rep("intervention", ceiling(length(clusterSizes)/2)))
  )
  
  # Generate data frame with information about participants
  Data <- data.frame(cluster = rep(factor(1:length(clusterSizes)), clusterSizes), # each participant belongs to a cluster
                     effectParticipants = rnorm(sum(clusterSizes), 0, 1) # participant effect (drawn from normal distribution with sd = sqrt(1) (= var.within))
  )
  
  # Combine cluster and participant information
  Data <- merge(Data, Clusters, by = "cluster")

  # Add cluster effect to participant effect and add treatment effect
  Data$outcome = Data$effectParticipants + Data$effectCluster + (as.numeric(Data$conditionCluster)-1)*treatment
  
  # Return dataset
  return(Data)
}

# Load plyr package (for summarising data)
library(plyr)

# Function for analysing data generated by createData.fnc()
analyseCluster.fnc <- function(dataset) {
  
  # Compute mean and number of observations in each cluster
  dat.sum <- ddply(dataset, .(cluster, conditionCluster), summarise,
                   meanOutcome = mean(outcome),
                   n = length(outcome))
  
  # Compute p-value for t-test on outcomes (ignoring clustering)
  # (anova = t-test here)
  ttestIgnore <- anova(lm(outcome ~ conditionCluster, dataset))$'Pr(>F)'[1]
  
  # Compute p-value for t-test on cluster means, weighted for cluster size
  ttestWeighted <- anova(lm(meanOutcome ~ conditionCluster, dat.sum, weights = n)))$'Pr(>F)'[1]
  
  # Compute p-value for t-test on cluster means, unweighted
  ttestUnweighted <- anova(lm(meanOutcome ~ conditionCluster, dat.sum))$'Pr(>F)'[1]
  
  return(list(ttestIgnore,
              ttestWeighted,
              ttestUnweighted
  )
  )
}

# Combine these two function
simulate.fnc <- function(ICC = 0.1, 
                         clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8),
                         treatment = 0) {
  Data <- createData.fnc(ICC = 0.1,
                         clusterSizes = sample(clusterSizes), # resample (wo replacement) clusterSizes: you don't know beforehand which cluster ends up in which condition
                         treatment = treatment)
  
  pVals <- analyseCluster.fnc(Data)
  return(pVals)
}

# Now run the simulation 10,000 times and see how many p-values are below 0.05 (should be 5%)
pvals <- replicate(1e4, simulate.fnc(clusterSizes = c(10, 50, 13, 80, 14, 86, 62, 45, 41, 8)))
mean(unlist(pvals[1,] <= 0.05)) # 44%
mean(unlist(pvals[2,] <= 0.05)) # 9%
mean(unlist(pvals[3,] <= 0.05)) # 5%