If all samples were actually equally likely, then the sum of the $1$s drawn from the population would have a hypergeometric distribution with $p = n\times \text{popMean}$ "good" items, $n-p = n\times(1-\text{popMean})$ "bad" items, and $k$ draws. This random variable $X$ has
$$E(X) = k\text{popMean} = \frac{kp}{n}$$
and $$var(X) = \frac{k\text{popMean}(1-\text{popMean})(n-k)}{n-1} = \frac{kp(n-p)(n-k)}{n^2(n-1)}.$$
Instead of the sum, we look at the mean of the $k$ draws, $\bar{X}$. It has a scaled hypergeometric distribution, with
$$E(\bar{X}) = \text{popMean} = \frac{p}{n}$$
and $$var(\bar{X}) = \frac{\text{popMean}(1-\text{popMean})(n-k)}{k(n-1)} = \frac{p(n-p)(n-k)}{kn^2(n-1)}.$$
Finally, we sample from this distribution $B$ times and take the sample average -- this is the column Sample Mean in the table. This is an average of IID random variables, so it has mean $\text{popMean} = \frac{p}{n}$ and variance
$$\frac{\text{popMean}(1-\text{popMean})(n-k)}{Bk(n-1)} = \frac{p(n-p)(n-k)}{Bkn^2(n-1)}.$$

In [1]:
getEmpiricalDistr <- function(n, k, reps=1e7){
  # Draw `reps` samples of size k without replacement from 1:n and count how
  # often each distinct sample occurs.
  # n = pop size
  # k = sample size
  # reps = number of samples to draw
  # Returns a list with one entry per distinct sample, in order of first
  # appearance; each entry is list(sample = sorted item indices, freq = count).

  # Fill by row so that each row is one sorted sample. This also holds when
  # k == 1, where replicate() returns a plain vector and t() would produce a
  # single row of length reps instead of reps rows of length 1.
  samReplicates <- matrix(replicate(reps, sort(sample(n, k))),
                          ncol = k, byrow = TRUE)

  # Collapse every row into a string key so that counting is a single pass,
  # rather than re-scanning all replicates once per distinct sample.
  sampleKeys <- do.call(paste,
                        c(lapply(seq_len(k), function(j) samReplicates[, j]),
                          sep = ","))
  firstSeen <- which(!duplicated(sampleKeys))
  freqs <- tabulate(match(sampleKeys, sampleKeys[firstSeen]),
                    nbins = length(firstSeen))

  uniqueSamples <- lapply(seq_along(firstSeen), function(i){
    list(sample = samReplicates[firstSeen[i], ], freq = freqs[i])
  })
  return(uniqueSamples)
}

                                         
getEmpiricalDistrRaw <- function(n, k, reps=1e7){
    # Just generates the samples and leaves them in matrix form, one per row.
    # Each row holds the k sampled item indices in increasing order (every
    # sample is passed through sort()); the rows themselves are not
    # de-duplicated or tabulated.
    # n = pop size
    # k = sample size
    # reps = number of samples to draw

    # Fill by row so that the result is always reps x k. With t(replicate(...))
    # the k == 1 case came back as a single row of length reps, because
    # replicate() simplifies to a plain vector there.
    samReplicates <- matrix(replicate(reps, sort(sample(n, k))),
                            ncol = k, byrow = TRUE)
    return(samReplicates)
}
                                         
                                         
findFreqItems <- function(samReplicates, m){
  # Return indices of the m most frequently occurring items
  # samReplicates = sampled item indices (e.g. the matrix of samples, one per
  #                 row); all entries are pooled and counted together
  # m = number of items to return
  # Returns a numeric vector of item labels, most frequent first. If fewer than
  # m distinct items occur, only those that occur are returned.
  countSorted <- sort(table(samReplicates), decreasing = TRUE)

  # seq_len() instead of 1:m: 1:0 is c(1, 0) and would return one item for
  # m == 0; capping at the table length avoids NA padding when m is too large.
  keep <- seq_len(min(m, length(countSorted)))
  return(as.numeric(names(countSorted)[keep]))
}
                                         

makePopulation <- function(n, p){
    # Create a population of 0s and 1s
    # n = pop size
    # p = number of 1s in the population
    # Returns a numeric vector whose first p entries are 1 and the rest 0.

    x <- rep(0, n)
    # seq_len(p) rather than 1:p: for p == 0, 1:p is c(1, 0) and would set
    # the first element to 1 instead of leaving the population all zeros.
    x[seq_len(p)] <- 1
    return(x)
}
     
                                         
makeAdversarialPopulation <- function(n, indices){
    # Build a 0/1 population with the 1s placed at chosen positions.
    # n = pop size
    # indices = positions that should hold a 1; every other position is 0
    allZeros <- rep(0, n)
    replace(allZeros, indices, 1)
}

                                         
getPopMean <- function(x){
    # Mean of the population vector x (for a 0/1 population, the share of 1s).
    popAverage <- mean(x)
    popAverage
}


getSampleMean <- function(x, samReplicates){
    # Average, over all samples, of the within-sample mean of x.
    # x = population
    # samReplicates = matrix of samples, one per row, holding item indices
    #                 (output of getEmpiricalDistrRaw, not getEmpiricalDistr)
    meanOfSampledItems <- function(itemIdx){
        mean(x[itemIdx])
    }
    perSampleMeans <- apply(samReplicates, 1, meanOfSampledItems)
    mean(perSampleMeans)
}

# Super-Duper

In [2]:
# Boilerplate stuff

# Simulation grid: number of samples per configuration, population sizes,
# sample sizes, and numbers of 1s in the population.
reps <- 1e4
n <- c(13, 30, 90)
k <- c(4, 10, 20)
p <- c(5, 10, 20)

# Empty accumulators; the simulation loop appends one entry per configuration.
nvalues <- NULL
kvalues <- NULL
prng <- NULL
seed <- NULL
popMean <- NULL
sampleMean <- NULL
bias <- NULL
relBias <- NULL
theoreticalSE <- NULL

In [3]:
seedvalues <- c(100, 233424280)

# For every (population size, sample size, number of 1s) combination and every
# seed: draw `reps` samples with the Super-Duper PRNG, put the 1s on the items
# that were sampled most often, and record how far the average sample mean is
# from the population mean.
for(nn in n){
  for(kk in k){
    for(pp in p){
      # Skip combinations where the number of 1s or the sample size is not
      # smaller than the population. The condition is scalar, so use ||.
      if(pp >= nn || kk >= nn){
        next
      }
      for(ss in seedvalues){
        set.seed(ss, kind = "Super-Duper")
        
        # itemCounts is the reps x kk matrix of sampled item indices
        itemCounts <- getEmpiricalDistrRaw(n=nn, k=kk, reps=reps)
        most_freq_p <- findFreqItems(itemCounts, pp)
         
        # Adversarial population: 1s on the pp most frequently sampled items
        x <- makeAdversarialPopulation(nn, most_freq_p)
        #x <- makePopulation(nn, pp)
        truePopMean <- getPopMean(x)
        popMean <- c(popMean, truePopMean)
        obsSampMean <- getSampleMean(x, itemCounts)
        sampleMean <- c(sampleMean, obsSampMean)
        nvalues <- c(nvalues, nn)
        kvalues <- c(kvalues, kk)
        prng <- c(prng, "Super-Duper")
        seed <- c(seed, ss)
        
        estimBias <- obsSampMean - truePopMean
        bias <- c(bias, estimBias)
        relBias <- c(relBias, estimBias/truePopMean)
        
        # SE of the average of `reps` sample means under equally likely
        # samples (hypergeometric variance with finite-population correction)
        theoreticalSE <- c(theoreticalSE,
                           sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1))))
      }
    }
  }
}

In [4]:
# One row per (population size, sample size, number of 1s, seed) run.
# The seed column uses the per-run `seed` vector recorded by the loop rather
# than the short `seedvalues` vector, so it does not depend on cbind()
# recycling lining up with the loop order.
cbind("Pop size" = nvalues, 
      "Sample size" = kvalues, 
      "seed" = seed, 
      "Pop Mean" = popMean, 
      "Sample Mean" = sampleMean, 
      "Bias" = bias, 
      "Relative bias" = relBias, 
      "Theoretical SE" = theoreticalSE,
      "Bias/Theoretical SE" = bias/theoreticalSE 
)

Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE,Bias/Theoretical SE
13,4,100,0.38461538,0.390175,0.005559615,0.014455,0.0021066252,2.63911
13,4,233424280,0.38461538,0.3897,0.005084615,0.01322,0.0021066252,2.413631
13,4,100,0.76923077,0.77415,0.004919231,0.006395,0.001824391,2.696369
13,4,233424280,0.76923077,0.77375,0.004519231,0.005875,0.001824391,2.477118
13,10,100,0.38461538,0.38654,0.001924615,0.005004,0.0007692308,2.502
13,10,233424280,0.38461538,0.38665,0.002034615,0.00529,0.0007692308,2.645
13,10,100,0.76923077,0.77085,0.001619231,0.002105,0.0006661734,2.430645
13,10,233424280,0.76923077,0.77041,0.001179231,0.001533,0.0006661734,1.770156
30,4,100,0.16666667,0.17255,0.005883333,0.0353,0.0017643772,3.33451
30,4,233424280,0.16666667,0.1729,0.006233333,0.0374,0.0017643772,3.53288


# Mersenne Twister

In [7]:
# Boilerplate stuff

# Simulation grid: number of samples per configuration, population sizes,
# sample sizes, and numbers of 1s in the population.
reps <- 1e4
n <- c(13, 30, 90)
k <- c(4, 10, 20)
p <- c(5, 10, 20)

# Reset the accumulators; the simulation loop appends one entry per
# configuration.
nvalues <- NULL
kvalues <- NULL
prng <- NULL
seed <- NULL
popMean <- NULL
sampleMean <- NULL
bias <- NULL
relBias <- NULL
theoreticalSE <- NULL

In [8]:
seedvalues <- c(100, 233424280, 429496729)

# For every (population size, sample size, number of 1s) combination and every
# seed: draw `reps` samples with the Mersenne-Twister PRNG, put the 1s on the
# items that were sampled most often, and record how far the average sample
# mean is from the population mean.
for(nn in n){
  for(kk in k){
    for(pp in p){
      # Skip combinations where the number of 1s or the sample size is not
      # smaller than the population. The condition is scalar, so use ||.
      if(pp >= nn || kk >= nn){
        next
      }
      for(ss in seedvalues){
        set.seed(ss, kind = "Mersenne-Twister")
        
        # itemCounts is the reps x kk matrix of sampled item indices
        itemCounts <- getEmpiricalDistrRaw(n=nn, k=kk, reps=reps)
        most_freq_p <- findFreqItems(itemCounts, pp)
         
        # Adversarial population: 1s on the pp most frequently sampled items
        x <- makeAdversarialPopulation(nn, most_freq_p)
        #x <- makePopulation(nn, pp)
        truePopMean <- getPopMean(x)
        popMean <- c(popMean, truePopMean)
        obsSampMean <- getSampleMean(x, itemCounts)
        sampleMean <- c(sampleMean, obsSampMean)
        nvalues <- c(nvalues, nn)
        kvalues <- c(kvalues, kk)
        prng <- c(prng, "Mersenne Twister")
        seed <- c(seed, ss)
        
        estimBias <- obsSampMean - truePopMean
        bias <- c(bias, estimBias)
        relBias <- c(relBias, estimBias/truePopMean)
        
        # SE of the average of `reps` sample means under equally likely
        # samples (hypergeometric variance with finite-population correction)
        theoreticalSE <- c(theoreticalSE,
                           sqrt(truePopMean*(1-truePopMean)*(nn-kk)/(reps * kk * (nn-1))))
      }
    }
  }
}

In [9]:
# One row per (population size, sample size, number of 1s, seed) run.
# The seed column uses the per-run `seed` vector recorded by the loop rather
# than the short `seedvalues` vector, so it does not depend on cbind()
# recycling lining up with the loop order.
cbind("Pop size" = nvalues, 
      "Sample size" = kvalues, 
      "seed" = seed, 
      "Pop Mean" = popMean, 
      "Sample Mean" = sampleMean, 
      "Bias" = bias, 
      "Relative bias" = relBias, 
      "Theoretical SE" = theoreticalSE,
      "Bias/Theoretical SE" = bias/theoreticalSE 
)

Pop size,Sample size,seed,Pop Mean,Sample Mean,Bias,Relative bias,Theoretical SE,Bias/Theoretical SE
13,4,100,0.3846154,0.390350,0.005734615,0.0149100,0.0021066252,2.722181
13,4,233424280,0.3846154,0.390100,0.005484615,0.0142600,0.0021066252,2.603508
13,4,429496729,0.3846154,0.392025,0.007409615,0.0192650,0.0021066252,3.517292
13,4,100,0.7692308,0.774200,0.004969231,0.0064600,0.0018243910,2.723775
13,4,233424280,0.7692308,0.774450,0.005219231,0.0067850,0.0018243910,2.860807
13,4,429496729,0.7692308,0.773525,0.004294231,0.0055825,0.0018243910,2.353789
13,10,100,0.3846154,0.386230,0.001614615,0.0041980,0.0007692308,2.099000
13,10,233424280,0.3846154,0.386630,0.002014615,0.0052380,0.0007692308,2.619000
13,10,429496729,0.3846154,0.386360,0.001744615,0.0045360,0.0007692308,2.268000
13,10,100,0.7692308,0.770540,0.001309231,0.0017020,0.0006661734,1.965300
